Prefer CPS over MSR for PSR manipulation. Use UDIV/SDIV for general-purpose...

Prefer CPS over MSR for PSR manipulation. Use UDIV/SDIV for general-purpose division, and UMULL for /10. Detail: s/k_body: - _kernel_irqs_off and _kernel_irqs_on now use CPS for disabling/enabling IRQs as opposed to MSR. Apart from being shorter code sequences, it's generally a faster instruction. - __rt_udiv and __rt_sdiv (and aliases) now use the UDIV and SDIV instructions if building for ARMv7VE. - __rt_udiv10 and __rt_sdiv10 (and aliases) now use UMULL to multiply by 1/10 when building for targets with long multiply support, as this is faster than the old method. UDIV/SDIV can be fast too, but only for small numbers, making UMULL the best for the general case. Admin: Tested on Cortex-A15. Prototyping of division routines on assorted CPUs shows that UDIV/SDIV is generally between 20% and 400% faster than the old routine (Cortex-A7, Cortex-A53), or up to 1300% faster on Cortex-A15 (the CPU does not like the old routine!). Division by 10 is now about 20% faster across all appropriate CPUs. Version 5.89. Tagged as 'RISC_OSLib-5_89'

Prefer CPS over MSR for PSR manipulation. Use UDIV/SDIV for general-purpose...
Prefer CPS over MSR for PSR manipulation. Use UDIV/SDIV for general-purpose division, and UMULL for /10. Detail: s/k_body: - _kernel_irqs_off and _kernel_irqs_on now use CPS for disabling/enabling IRQs as opposed to MSR. Apart from being shorter code sequences, it's generally a faster instruction. - __rt_udiv and __rt_sdiv (and aliases) now use the UDIV and SDIV instructions if building for ARMv7VE. - __rt_udiv10 and __rt_sdiv10 (and aliases) now use UMULL to multiply by 1/10 when building for targets with long multiply support, as this is faster than the old method. UDIV/SDIV can be fast too, but only for small numbers, making UMULL the best for the general case. Admin: Tested on Cortex-A15. Prototyping of division routines on assorted CPUs shows that UDIV/SDIV is generally between 20% and 400% faster than the old routine (Cortex-A7, Cortex-A53), or up to 1300% faster on Cortex-A15 (the CPU does not like the old routine!). Division by 10 is now about 20% faster across all appropriate CPUs. Version 5.89. Tagged as 'RISC_OSLib-5_89'
377852c3 · Jeffrey Lee · 90c0b2a6 · 377852c3 · 377852c3 · 377852c3
Commit 377852c3 authored 8 years ago by Jeffrey Lee
Hide whitespace changes
Inline Side-by-side

Showing with 69 additions and 20 deletions

VersionASM VersionASM +6 -6

VersionNum VersionNum +10 -10

kernel/s/k_body kernel/s/k_body +53 -4

No files found.
--- a/VersionASM
+++ b/VersionASM
@@ -11,13 +11,13 @@
                        GBLS    Module_HelpVersion
                        GBLS    Module_ComponentName
                        GBLS    Module_ComponentPath
-Module_MajorVersion     SETS    "5.88"
-Module_Version          SETA    588
+Module_MajorVersion     SETS    "5.89"
+Module_Version          SETA    589
 Module_MinorVersion     SETS    ""
-Module_Date             SETS    "29 Feb 2016"
-Module_ApplicationDate  SETS    "29-Feb-16"
+Module_Date             SETS    "08 May 2016"
+Module_ApplicationDate  SETS    "08-May-16"
 Module_ComponentName    SETS    "RISC_OSLib"
 Module_ComponentPath    SETS    "castle/RiscOS/Sources/Lib/RISC_OSLib"
-Module_FullVersion      SETS    "5.88"
-Module_HelpVersion      SETS    "5.88 (29 Feb 2016)"
+Module_FullVersion      SETS    "5.89"
+Module_HelpVersion      SETS    "5.89 (08 May 2016)"
                        END
--- a/VersionNum
+++ b/VersionNum
-/* (5.88)
+/* (5.89)
 *
 * This file is automatically maintained by srccommit, do not edit manually.
 * Last processed by srccommit version: 1.1.
 *
 */
-#define Module_MajorVersion_CMHG        5.88
+#define Module_MajorVersion_CMHG        5.89
 #define Module_MinorVersion_CMHG        
-#define Module_Date_CMHG                29 Feb 2016
+#define Module_Date_CMHG                08 May 2016

-#define Module_MajorVersion             "5.88"
-#define Module_Version                  588
+#define Module_MajorVersion             "5.89"
+#define Module_Version                  589
 #define Module_MinorVersion             ""
-#define Module_Date                     "29 Feb 2016"
+#define Module_Date                     "08 May 2016"

-#define Module_ApplicationDate          "29-Feb-16"
+#define Module_ApplicationDate          "08-May-16"

 #define Module_ComponentName            "RISC_OSLib"
 #define Module_ComponentPath            "castle/RiscOS/Sources/Lib/RISC_OSLib"

-#define Module_FullVersion              "5.88"
-#define Module_HelpVersion              "5.88 (29 Feb 2016)"
-#define Module_LibraryVersionInfo       "5:88"
+#define Module_FullVersion              "5.89"
+#define Module_HelpVersion              "5.89 (08 May 2016)"
+#define Module_LibraryVersionInfo       "5:89"
--- a/kernel/s/k_body
+++ b/kernel/s/k_body
@@ -270,21 +270,29 @@ uwb_size        #       0
 [ {CONFIG}=26
        BICS    pc, lr, #PSRIBit        ; 32-bit OK - in {CONFIG}=26
 |
+   [ NoARMv6
        MRS     ip, CPSR
        BIC     ip, ip, #PSR32IBit
        MSR     CPSR_c, ip
+   |
+        CPSIE   i
+   ]
        Return  ,LinkNotStacked
 ]

 |_kernel_irqs_off|
-      [ {CONFIG}=26
+ [ {CONFIG}=26
        ORRS    pc, lr, #PSRIBit
-      |
+ |
+   [ NoARMv6
        MRS     ip, CPSR
        ORR     ip, ip, #PSR32IBit
        MSR     CPSR_c, ip
+   |
+        CPSID   i
+   ]
        Return  ,LinkNotStacked
-      ]
+ ]

 |_kernel_processor_mode|
      [ {CONFIG}=26
@@ -3132,7 +3140,7 @@ Sleep
 |x$udivide|
 ; Unsigned divide of a2 by a1: returns quotient in a1, remainder in a2
 ; Destroys a3 and ip
-
+      [ NoARMVE
        MOV     a3, #0
        RSBS    ip, a1, a2, LSR #3
        BCC     u_sh2
@@ -3180,6 +3188,14 @@ u_sh0   RSBS    ip, a1, a2
        ADCS    a3, a3, a3
        BCS     u_loop
        MOV     a1, a3
+      |
+; Long delay on UDIV result makes it faster to divide and then check for error
+        UDIV    a3, a2, a1
+        TEQ     a1, #0
+        BEQ     dividebyzero
+        MLS     a2, a3, a1, a2
+        MOV     a1, a3
+      ]
        Return  ,LinkNotStacked


@@ -3200,6 +3216,7 @@ u_sh0   RSBS    ip, a1, a2

 |__rt_udiv10|
 |_kernel_udiv10|
+      [ NoARMM
        SUB     a2, a1, #10
        SUB     a1, a1, a1, LSR #2
        ADD     a1, a1, a1, LSR #4
@@ -3210,6 +3227,16 @@ u_sh0   RSBS    ip, a1, a2
        SUBS    a2, a2, a3, LSL #1
        ADDPL   a1, a1, #1
        ADDMI   a2, a2, #10
+      |
+; For small numbers, UDIV would be faster than this, but not enough to make it
+; worth dynamically switching between algorithms.
+        LDR     a2, =&CCCCCCCD ; ceil((2^35) / 10), i.e. 8 * (2^32) / 10 rounded up
+        UMULL   ip, a3, a2, a1
+        MOV     a3, a3, LSR #3 ; Accurate division by 10
+        SUB     a2, a1, a3, LSL #1
+        MOV     a1, a3
+        SUB     a2, a2, a3, LSL #3
+      ]
        Return  ,LinkNotStacked


@@ -3220,6 +3247,7 @@ u_sh0   RSBS    ip, a1, a2
 ; Quotient is truncated (rounded towards zero).
 ; Sign of remainder = sign of dividend.
 ; Destroys a3, a4 and ip
+      [ NoARMVE
 ; Negates dividend and divisor, then does an unsigned divide; signs
 ; get sorted out again at the end.

@@ -3275,6 +3303,13 @@ s_sh0   RSBS    ip, a1, a2
        EORS    a1, a3, a4, ASR #31
        ADD     a1, a1, a4, LSR #31
        RSBCS   a2, a2, #0
+      |
+        SDIV    a3, a2, a1
+        TEQ     a1, #0
+        BEQ     dividebyzero
+        MLS     a2, a3, a1, a2
+        MOV     a1, a3
+      ]
        Return  ,LinkNotStacked

 ; Signed remainder of a2 by a1: returns remainder in a1
@@ -3292,6 +3327,7 @@ s_sh0   RSBS    ip, a1, a2

 |__rt_sdiv10|
 |_kernel_sdiv10|
+      [ NoARMM
        MOVS    a4, a1
        RSBMI   a1, a1, #0
        SUB     a2, a1, #10
@@ -3307,6 +3343,19 @@ s_sh0   RSBS    ip, a1, a2
        MOVS    a4, a4
        RSBMI   a1, a1, #0
        RSBMI   a2, a2, #0
+      |
+; Using SMULL here would be tricky due to the need to round towards zero
+        MOVS    a4, a1
+        LDR     a2, =&CCCCCCCD ; ceil((2^35) / 10), i.e. 8 * (2^32) / 10 rounded up
+        RSBMI   a1, a1, #0
+        UMULL   ip, a3, a2, a1
+        MOV     a3, a3, LSR #3 ; Accurate division by 10
+        SUB     a2, a1, a3, LSL #1
+        MOV     a1, a3
+        SUB     a2, a2, a3, LSL #3
+        RSBMI   a1, a1, #0
+        RSBMI   a2, a2, #0
+      ]
        Return  ,LinkNotStacked

        EXPORT  __rt_div0