From 377852c3a4cbe72772f24508227ca3d6ec5969b8 Mon Sep 17 00:00:00 2001 From: Jeffrey Lee <jlee@gitlab.riscosopen.org> Date: Sun, 8 May 2016 17:28:46 +0000 Subject: [PATCH] Prefer CPS over MSR for PSR manipulation. Use UDIV/SDIV for general-purpose division, and UMULL for /10. Detail: s/k_body: - _kernel_irqs_off and _kernel_irqs_on now use CPS for disabling/enabling IRQs as opposed to MSR. Apart from being shorter code sequences, it's generally a faster instruction. - __rt_udiv and __rt_sdiv (and aliases) now use the UDIV and SDIV instructions if building for ARMv7VE - __rt_udiv10 and __rt_sdiv10 (and aliases) now use UMULL to multiply by 1/10 when building for targets with long multiply support, as this is faster than the old method. UDIV/SDIV can be fast too, but only for small numbers, making UMULL the best for the general case. Admin: Tested on Cortex-A15 Prototyping of division routines on assorted CPUs shows that UDIV/SDIV is generally between 20% and 400% faster than the old routine (Cortex-A7, Cortex-A53), or up to 1300% faster on Cortex-A15 (the CPU does not like the old routine!) Division by 10 is now about 20% faster across all appropriate CPUs Version 5.89. 
Tagged as 'RISC_OSLib-5_89' --- VersionASM | 12 +++++------ VersionNum | 20 ++++++++--------- kernel/s/k_body | 57 +++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 69 insertions(+), 20 deletions(-) diff --git a/VersionASM b/VersionASM index f7aa9a6..15a1708 100644 --- a/VersionASM +++ b/VersionASM @@ -11,13 +11,13 @@ GBLS Module_HelpVersion GBLS Module_ComponentName GBLS Module_ComponentPath -Module_MajorVersion SETS "5.88" -Module_Version SETA 588 +Module_MajorVersion SETS "5.89" +Module_Version SETA 589 Module_MinorVersion SETS "" -Module_Date SETS "29 Feb 2016" -Module_ApplicationDate SETS "29-Feb-16" +Module_Date SETS "08 May 2016" +Module_ApplicationDate SETS "08-May-16" Module_ComponentName SETS "RISC_OSLib" Module_ComponentPath SETS "castle/RiscOS/Sources/Lib/RISC_OSLib" -Module_FullVersion SETS "5.88" -Module_HelpVersion SETS "5.88 (29 Feb 2016)" +Module_FullVersion SETS "5.89" +Module_HelpVersion SETS "5.89 (08 May 2016)" END diff --git a/VersionNum b/VersionNum index db8a580..50d1e11 100644 --- a/VersionNum +++ b/VersionNum @@ -1,23 +1,23 @@ -/* (5.88) +/* (5.89) * * This file is automatically maintained by srccommit, do not edit manually. * Last processed by srccommit version: 1.1. 
* */ -#define Module_MajorVersion_CMHG 5.88 +#define Module_MajorVersion_CMHG 5.89 #define Module_MinorVersion_CMHG -#define Module_Date_CMHG 29 Feb 2016 +#define Module_Date_CMHG 08 May 2016 -#define Module_MajorVersion "5.88" -#define Module_Version 588 +#define Module_MajorVersion "5.89" +#define Module_Version 589 #define Module_MinorVersion "" -#define Module_Date "29 Feb 2016" +#define Module_Date "08 May 2016" -#define Module_ApplicationDate "29-Feb-16" +#define Module_ApplicationDate "08-May-16" #define Module_ComponentName "RISC_OSLib" #define Module_ComponentPath "castle/RiscOS/Sources/Lib/RISC_OSLib" -#define Module_FullVersion "5.88" -#define Module_HelpVersion "5.88 (29 Feb 2016)" -#define Module_LibraryVersionInfo "5:88" +#define Module_FullVersion "5.89" +#define Module_HelpVersion "5.89 (08 May 2016)" +#define Module_LibraryVersionInfo "5:89" diff --git a/kernel/s/k_body b/kernel/s/k_body index 706519b..0dab27d 100644 --- a/kernel/s/k_body +++ b/kernel/s/k_body @@ -270,21 +270,29 @@ uwb_size # 0 [ {CONFIG}=26 BICS pc, lr, #PSRIBit ; 32-bit OK - in {CONFIG}=26 | + [ NoARMv6 MRS ip, CPSR BIC ip, ip, #PSR32IBit MSR CPSR_c, ip + | + CPSIE i + ] Return ,LinkNotStacked ] |_kernel_irqs_off| - [ {CONFIG}=26 + [ {CONFIG}=26 ORRS pc, lr, #PSRIBit - | + | + [ NoARMv6 MRS ip, CPSR ORR ip, ip, #PSR32IBit MSR CPSR_c, ip + | + CPSID i + ] Return ,LinkNotStacked - ] + ] |_kernel_processor_mode| [ {CONFIG}=26 @@ -3132,7 +3140,7 @@ Sleep |x$udivide| ; Unsigned divide of a2 by a1: returns quotient in a1, remainder in a2 ; Destroys a3 and ip - + [ NoARMVE MOV a3, #0 RSBS ip, a1, a2, LSR #3 BCC u_sh2 @@ -3180,6 +3188,14 @@ u_sh0 RSBS ip, a1, a2 ADCS a3, a3, a3 BCS u_loop MOV a1, a3 + | +; Long delay on UDIV result makes it faster to divide and then check for error + UDIV a3, a2, a1 + TEQ a1, #0 + BEQ dividebyzero + MLS a2, a3, a1, a2 + MOV a1, a3 + ] Return ,LinkNotStacked @@ -3200,6 +3216,7 @@ u_sh0 RSBS ip, a1, a2 |__rt_udiv10| |_kernel_udiv10| + [ NoARMM SUB a2, a1, 
#10 SUB a1, a1, a1, LSR #2 ADD a1, a1, a1, LSR #4 @@ -3210,6 +3227,16 @@ u_sh0 RSBS ip, a1, a2 SUBS a2, a2, a3, LSL #1 ADDPL a1, a1, #1 ADDMI a2, a2, #10 + | +; For small numbers, UDIV would be faster than this, but not enough to make it +; worth dynamically switching between algorithms. + LDR a2, =&CCCCCCCD ; (8 * 2^32) / 10, rounded up + UMULL ip, a3, a2, a1 + MOV a3, a3, LSR #3 ; Accurate division by 10 + SUB a2, a1, a3, LSL #1 + MOV a1, a3 + SUB a2, a2, a3, LSL #3 + ] Return ,LinkNotStacked @@ -3220,6 +3247,7 @@ u_sh0 RSBS ip, a1, a2 ; Quotient is truncated (rounded towards zero). ; Sign of remainder = sign of dividend. ; Destroys a3, a4 and ip + [ NoARMVE ; Negates dividend and divisor, then does an unsigned divide; signs ; get sorted out again at the end. @@ -3275,6 +3303,13 @@ s_sh0 RSBS ip, a1, a2 EORS a1, a3, a4, ASR #31 ADD a1, a1, a4, LSR #31 RSBCS a2, a2, #0 + | + SDIV a3, a2, a1 + TEQ a1, #0 + BEQ dividebyzero + MLS a2, a3, a1, a2 + MOV a1, a3 + ] Return ,LinkNotStacked ; Signed remainder of a2 by a1: returns remainder in a1 @@ -3292,6 +3327,7 @@ s_sh0 RSBS ip, a1, a2 |__rt_sdiv10| |_kernel_sdiv10| + [ NoARMM MOVS a4, a1 RSBMI a1, a1, #0 SUB a2, a1, #10 @@ -3307,6 +3343,19 @@ s_sh0 RSBS ip, a1, a2 MOVS a4, a4 RSBMI a1, a1, #0 RSBMI a2, a2, #0 + | +; Using SMULL here would be tricky due to the need to round towards zero + MOVS a4, a1 + LDR a2, =&CCCCCCCD ; (8 * 2^32) / 10, rounded up + RSBMI a1, a1, #0 + UMULL ip, a3, a2, a1 + MOV a3, a3, LSR #3 ; Accurate division by 10 + SUB a2, a1, a3, LSL #1 + MOV a1, a3 + SUB a2, a2, a3, LSL #3 + RSBMI a1, a1, #0 + RSBMI a2, a2, #0 + ] Return ,LinkNotStacked EXPORT __rt_div0 -- GitLab