From 377852c3a4cbe72772f24508227ca3d6ec5969b8 Mon Sep 17 00:00:00 2001
From: Jeffrey Lee <jlee@gitlab.riscosopen.org>
Date: Sun, 8 May 2016 17:28:46 +0000
Subject: [PATCH] Prefer CPS over MSR for PSR manipulation. Use UDIV/SDIV for
 general-purpose division, and UMULL for /10.

Detail:
  s/k_body:
  - _kernel_irqs_off and _kernel_irqs_on now use CPS for disabling/enabling IRQs as opposed to MSR. Apart from being shorter code sequences, it's generally a faster instruction.
  - __rt_udiv and __rt_sdiv (and aliases) now use the UDIV and SDIV instructions if building for ARMv7VE
  - __rt_udiv10 and __rt_sdiv10 (and aliases) now use UMULL to multiply by 1/10 when building for targets with long multiply support, as this is faster than the old method. UDIV/SDIV can be fast too, but only for small numbers, making UMULL the best for the general case.
Admin:
  Tested on Cortex-A15
  Prototyping of division routines on assorted CPUs shows that UDIV/SDIV is generally between 20% and 400% faster than the old routine (Cortex-A7, Cortex-A53), or up to 1300% faster on Cortex-A15 (the CPU does not like the old routine!)
  Division by 10 is now about 20% faster across all appropriate CPUs


Version 5.89. Tagged as 'RISC_OSLib-5_89'
---
 VersionASM      | 12 +++++------
 VersionNum      | 20 ++++++++---------
 kernel/s/k_body | 57 +++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/VersionASM b/VersionASM
index f7aa9a6..15a1708 100644
--- a/VersionASM
+++ b/VersionASM
@@ -11,13 +11,13 @@
                         GBLS    Module_HelpVersion
                         GBLS    Module_ComponentName
                         GBLS    Module_ComponentPath
-Module_MajorVersion     SETS    "5.88"
-Module_Version          SETA    588
+Module_MajorVersion     SETS    "5.89"
+Module_Version          SETA    589
 Module_MinorVersion     SETS    ""
-Module_Date             SETS    "29 Feb 2016"
-Module_ApplicationDate  SETS    "29-Feb-16"
+Module_Date             SETS    "08 May 2016"
+Module_ApplicationDate  SETS    "08-May-16"
 Module_ComponentName    SETS    "RISC_OSLib"
 Module_ComponentPath    SETS    "castle/RiscOS/Sources/Lib/RISC_OSLib"
-Module_FullVersion      SETS    "5.88"
-Module_HelpVersion      SETS    "5.88 (29 Feb 2016)"
+Module_FullVersion      SETS    "5.89"
+Module_HelpVersion      SETS    "5.89 (08 May 2016)"
                         END
diff --git a/VersionNum b/VersionNum
index db8a580..50d1e11 100644
--- a/VersionNum
+++ b/VersionNum
@@ -1,23 +1,23 @@
-/* (5.88)
+/* (5.89)
  *
  * This file is automatically maintained by srccommit, do not edit manually.
  * Last processed by srccommit version: 1.1.
  *
  */
-#define Module_MajorVersion_CMHG        5.88
+#define Module_MajorVersion_CMHG        5.89
 #define Module_MinorVersion_CMHG        
-#define Module_Date_CMHG                29 Feb 2016
+#define Module_Date_CMHG                08 May 2016
 
-#define Module_MajorVersion             "5.88"
-#define Module_Version                  588
+#define Module_MajorVersion             "5.89"
+#define Module_Version                  589
 #define Module_MinorVersion             ""
-#define Module_Date                     "29 Feb 2016"
+#define Module_Date                     "08 May 2016"
 
-#define Module_ApplicationDate          "29-Feb-16"
+#define Module_ApplicationDate          "08-May-16"
 
 #define Module_ComponentName            "RISC_OSLib"
 #define Module_ComponentPath            "castle/RiscOS/Sources/Lib/RISC_OSLib"
 
-#define Module_FullVersion              "5.88"
-#define Module_HelpVersion              "5.88 (29 Feb 2016)"
-#define Module_LibraryVersionInfo       "5:88"
+#define Module_FullVersion              "5.89"
+#define Module_HelpVersion              "5.89 (08 May 2016)"
+#define Module_LibraryVersionInfo       "5:89"
diff --git a/kernel/s/k_body b/kernel/s/k_body
index 706519b..0dab27d 100644
--- a/kernel/s/k_body
+++ b/kernel/s/k_body
@@ -270,21 +270,29 @@ uwb_size        #       0
  [ {CONFIG}=26
         BICS    pc, lr, #PSRIBit        ; 32-bit OK - in {CONFIG}=26
  |
+   [ NoARMv6
         MRS     ip, CPSR
         BIC     ip, ip, #PSR32IBit
         MSR     CPSR_c, ip
+   |
+        CPSIE   i
+   ]
         Return  ,LinkNotStacked
  ]
 
 |_kernel_irqs_off|
-      [ {CONFIG}=26
+ [ {CONFIG}=26
         ORRS    pc, lr, #PSRIBit
-      |
+ |
+   [ NoARMv6
         MRS     ip, CPSR
         ORR     ip, ip, #PSR32IBit
         MSR     CPSR_c, ip
+   |
+        CPSID   i                       ; ARMv6+: disable IRQs directly, replacing the MRS/ORR/MSR sequence
+   ]
         Return  ,LinkNotStacked
-      ]
+ ]
 
 |_kernel_processor_mode|
       [ {CONFIG}=26
@@ -3132,7 +3140,7 @@ Sleep
 |x$udivide|
 ; Unsigned divide of a2 by a1: returns quotient in a1, remainder in a2
 ; Destroys a3 and ip
-
+      [ NoARMVE
         MOV     a3, #0
         RSBS    ip, a1, a2, LSR #3
         BCC     u_sh2
@@ -3180,6 +3188,14 @@ u_sh0   RSBS    ip, a1, a2
         ADCS    a3, a3, a3
         BCS     u_loop
         MOV     a1, a3
+      |
+; Long delay on UDIV result makes it faster to divide and then check for error
+        UDIV    a3, a2, a1              ; a3 = a2 / a1 (unsigned quotient)
+        TEQ     a1, #0                  ; was the divisor zero?
+        BEQ     dividebyzero
+        MLS     a2, a3, a1, a2          ; a2 = a2 - a3 * a1 (remainder)
+        MOV     a1, a3                  ; return quotient in a1
+      ]
         Return  ,LinkNotStacked
 
 
@@ -3200,6 +3216,7 @@ u_sh0   RSBS    ip, a1, a2
 
 |__rt_udiv10|
 |_kernel_udiv10|
+      [ NoARMM
         SUB     a2, a1, #10
         SUB     a1, a1, a1, LSR #2
         ADD     a1, a1, a1, LSR #4
@@ -3210,6 +3227,16 @@ u_sh0   RSBS    ip, a1, a2
         SUBS    a2, a2, a3, LSL #1
         ADDPL   a1, a1, #1
         ADDMI   a2, a2, #10
+      |
+; For small numbers, UDIV would be faster than this, but not enough to make it
+; worth dynamically switching between algorithms.
+        LDR     a2, =&CCCCCCCD ; (2^35) / 10, rounded up
+        UMULL   ip, a3, a2, a1 ; a3 = high word of a1 * &CCCCCCCD (low word in ip is not used)
+        MOV     a3, a3, LSR #3 ; Accurate division by 10: a3 = (a1 * &CCCCCCCD) >> 35
+        SUB     a2, a1, a3, LSL #1 ; a2 = dividend - 2 * quotient
+        MOV     a1, a3 ; return quotient in a1
+        SUB     a2, a2, a3, LSL #3 ; a2 = dividend - 10 * quotient (remainder)
+      ]
         Return  ,LinkNotStacked
 
 
@@ -3220,6 +3247,7 @@ u_sh0   RSBS    ip, a1, a2
 ; Quotient is truncated (rounded towards zero).
 ; Sign of remainder = sign of dividend.
 ; Destroys a3, a4 and ip
+      [ NoARMVE
 ; Negates dividend and divisor, then does an unsigned divide; signs
 ; get sorted out again at the end.
 
@@ -3275,6 +3303,13 @@ s_sh0   RSBS    ip, a1, a2
         EORS    a1, a3, a4, ASR #31
         ADD     a1, a1, a4, LSR #31
         RSBCS   a2, a2, #0
+      |
+        SDIV    a3, a2, a1              ; a3 = a2 / a1 (signed quotient)
+        TEQ     a1, #0                  ; was the divisor zero?
+        BEQ     dividebyzero
+        MLS     a2, a3, a1, a2          ; a2 = a2 - a3 * a1 (remainder)
+        MOV     a1, a3                  ; return quotient in a1
+      ]
         Return  ,LinkNotStacked
 
 ; Signed remainder of a2 by a1: returns remainder in a1
@@ -3292,6 +3327,7 @@ s_sh0   RSBS    ip, a1, a2
 
 |__rt_sdiv10|
 |_kernel_sdiv10|
+      [ NoARMM
         MOVS    a4, a1
         RSBMI   a1, a1, #0
         SUB     a2, a1, #10
@@ -3307,6 +3343,19 @@ s_sh0   RSBS    ip, a1, a2
         MOVS    a4, a4
         RSBMI   a1, a1, #0
         RSBMI   a2, a2, #0
+      |
+; Using SMULL here would be tricky due to the need to round towards zero
+        MOVS    a4, a1 ; set N from the sign of the dividend; flags are left untouched until the final RSBMIs
+        LDR     a2, =&CCCCCCCD ; (2^35) / 10, rounded up
+        RSBMI   a1, a1, #0 ; a1 = magnitude of dividend
+        UMULL   ip, a3, a2, a1 ; a3 = high word of a1 * &CCCCCCCD (low word in ip is not used)
+        MOV     a3, a3, LSR #3 ; Accurate division by 10: a3 = (a1 * &CCCCCCCD) >> 35
+        SUB     a2, a1, a3, LSL #1 ; a2 = magnitude - 2 * quotient
+        MOV     a1, a3 ; unsigned quotient
+        SUB     a2, a2, a3, LSL #3 ; a2 = magnitude - 10 * quotient (unsigned remainder)
+        RSBMI   a1, a1, #0 ; dividend was negative: negate quotient
+        RSBMI   a2, a2, #0 ; ...and remainder
+      ]
         Return  ,LinkNotStacked
 
         EXPORT  __rt_div0
-- 
GitLab