From 2dfd92c1b25050fb6a2a96f921e4d56da61f90a5 Mon Sep 17 00:00:00 2001
From: Jeffrey Lee <jlee@gitlab.riscosopen.org>
Date: Mon, 12 Sep 2011 20:31:39 +0000
Subject: [PATCH] ARMv7 fixes

Detail:
  hdr/Copro15ops:
    - Fixed incorrect encodings of ISH/ISHST variants of DMB/DSB instructions
  s/ARMops, s/HAL, hdr/KernelWS:
    - Replace the ARMv7 cache maintenance code with the example code from the ARMv7 ARM. This allows it to deal with caches with non power-of-two set/way counts, and caches with only one way.
    - Fixed Analyse_WB_CR7_Lx to use the cache level ID register to work out how many caches to query instead of just looking for a 0 result from CCSIDR.
    - Also only look for 7 cache levels, since level 8 doesn't exist according to the ARMv7 ARM.
  s/NewReset:
    - Removed some incorrect/misleading debug output
Admin:
  Tested on rev A2 BB-xM


Version 5.35, 4.79.2.98.2.51. Tagged as 'Kernel-5_35-4_79_2_98_2_51'
---
 VersionASM     |   6 +-
 VersionNum     |   8 +-
 hdr/Copro15ops |   8 +-
 hdr/KernelWS   |   4 +-
 s/ARMops       | 303 ++++++++++++++++++++-----------------------------
 s/HAL          |  94 +++++++--------
 s/NewReset     |   4 +-
 7 files changed, 187 insertions(+), 240 deletions(-)

diff --git a/VersionASM b/VersionASM
index 4a78938..bee0d80 100644
--- a/VersionASM
+++ b/VersionASM
@@ -13,11 +13,11 @@
                         GBLS    Module_ComponentPath
 Module_MajorVersion     SETS    "5.35"
 Module_Version          SETA    535
-Module_MinorVersion     SETS    "4.79.2.98.2.50"
+Module_MinorVersion     SETS    "4.79.2.98.2.51"
 Module_Date             SETS    "12 Sep 2011"
 Module_ApplicationDate  SETS    "12-Sep-11"
 Module_ComponentName    SETS    "Kernel"
 Module_ComponentPath    SETS    "castle/RiscOS/Sources/Kernel"
-Module_FullVersion      SETS    "5.35 (4.79.2.98.2.50)"
-Module_HelpVersion      SETS    "5.35 (12 Sep 2011) 4.79.2.98.2.50"
+Module_FullVersion      SETS    "5.35 (4.79.2.98.2.51)"
+Module_HelpVersion      SETS    "5.35 (12 Sep 2011) 4.79.2.98.2.51"
                         END
diff --git a/VersionNum b/VersionNum
index ef1b274..bd8426e 100644
--- a/VersionNum
+++ b/VersionNum
@@ -5,12 +5,12 @@
  *
  */
 #define Module_MajorVersion_CMHG        5.35
-#define Module_MinorVersion_CMHG        4.79.2.98.2.50
+#define Module_MinorVersion_CMHG        4.79.2.98.2.51
 #define Module_Date_CMHG                12 Sep 2011
 
 #define Module_MajorVersion             "5.35"
 #define Module_Version                  535
-#define Module_MinorVersion             "4.79.2.98.2.50"
+#define Module_MinorVersion             "4.79.2.98.2.51"
 #define Module_Date                     "12 Sep 2011"
 
 #define Module_ApplicationDate          "12-Sep-11"
@@ -18,6 +18,6 @@
 #define Module_ComponentName            "Kernel"
 #define Module_ComponentPath            "castle/RiscOS/Sources/Kernel"
 
-#define Module_FullVersion              "5.35 (4.79.2.98.2.50)"
-#define Module_HelpVersion              "5.35 (12 Sep 2011) 4.79.2.98.2.50"
+#define Module_FullVersion              "5.35 (4.79.2.98.2.51)"
+#define Module_HelpVersion              "5.35 (12 Sep 2011) 4.79.2.98.2.51"
 #define Module_LibraryVersionInfo       "5:35"
diff --git a/hdr/Copro15ops b/hdr/Copro15ops
index 5e7ddef..ea94c34 100644
--- a/hdr/Copro15ops
+++ b/hdr/Copro15ops
@@ -605,10 +605,10 @@ C15 CN 15
         DCI &F57FF04E ; DSB ST
        |
        [ "$option"="ISH"
-        DCI &F57FF04D ; DSB ISH
+        DCI &F57FF04B ; DSB ISH
        |
        [ "$option"="ISHST"
-        DCI &F57FF04C ; DSB ISHST
+        DCI &F57FF04A ; DSB ISHST
        |
        [ "$option"="NSH"
         DCI &F57FF047 ; DSB NSH
@@ -656,10 +656,10 @@ C15 CN 15
         DCI &F57FF05E ; DMB ST
        |
        [ "$option"="ISH"
-        DCI &F57FF05D ; DMB ISH
+        DCI &F57FF05B ; DMB ISH
        |
        [ "$option"="ISHST"
-        DCI &F57FF05C ; DMB ISHST
+        DCI &F57FF05A ; DMB ISHST
        |
        [ "$option"="NSH"
         DCI &F57FF057 ; DMB NSH
diff --git a/hdr/KernelWS b/hdr/KernelWS
index b7a33c7..f32be4f 100644
--- a/hdr/KernelWS
+++ b/hdr/KernelWS
@@ -1323,8 +1323,8 @@ MMUControlSoftCopy #    4               ; Soft copy of ARM control register
 DeviceCount     #       4       ; size of our table of devices in the system heap
 DeviceTable     #       4       ; pointer to table
 Cache_Lx_Info   #       4       ; Cache level ID register
-Cache_Lx_DTable #       4*8     ; Data/unified cache layout for all 8 levels
-Cache_Lx_ITable #       4*8     ; Instruction cache layout for all 8 levels
+Cache_Lx_DTable #       4*7     ; Data/unified cache layout for all 7 levels
+Cache_Lx_ITable #       4*7     ; Instruction cache layout for all 7 levels
  ]
 
 AplWorkSize * AppSpaceDANode + DANode_Size
diff --git a/s/ARMops b/s/ARMops
index 46dea8f..426a5de 100644
--- a/s/ARMops
+++ b/s/ARMops
@@ -529,29 +529,42 @@ Analyse_WB_CR7_Lx
         MRC     p15, 1, a1, c0, c0, 1 ; Cache level ID register
         MOV     v2, v6 ; Work around DTable/ITable alignment issues
         STR     a1, [v2, #Cache_Lx_Info]!
-        ADD     a1, v2, #Cache_Lx_DTable-Cache_Lx_Info
-        ADD     a2, v2, #Cache_Lx_ITable-Cache_Lx_Info
+        ADD     a2, v2, #Cache_Lx_DTable-Cache_Lx_Info
         MOV     a3, #0
         MOV     a4, #256 ; Smallest instruction cache line length
         MOV     v2, #256 ; Smallest data/unified cache line length (although atm we only need this to be the smallest data cache line length)
 10
-        MCR     p15, 2, a3, c0, c0, 0 ; Program cache size selection register
-        MRC     p15, 1, v1, c0, c0, 0 ; Get size info (data/unified)
-        STR     v1, [a1],#4
-        CMP     v1, #0 ; Does the cache exist?
+        ANDS    v1, a1, #6 ; Data or unified cache at this level?
+        MCRNE   p15, 2, a3, c0, c0, 0 ; Program cache size selection register
+        myISB   ,v1
+        MRCNE   p15, 1, v1, c0, c0, 0 ; Get size info (data/unified)
+        STR     v1, [a2]
         AND     v1, v1, #7 ; Get line size
-        CMPNE   v1, v2
-        MOVLT   v2, v1 ; Earlier CMP will not set LE flags if v1=0
+        CMP     v1, v2
+        MOVLT   v2, v1
         ADD     a3, a3, #1
-        MCR     p15, 2, a3, c0, c0, 0 ; Program cache size selection register
-        MRC     p15, 1, v1, c0, c0, 0 ; Get size info (instruction)
-        STR     v1, [a2],#4
-        CMP     v1, #0 ; Does the cache exist?
+        ANDS    v1, a1, #1 ; Instruction cache at this level?
+        MCRNE   p15, 2, a3, c0, c0, 0 ; Program cache size selection register
+        myISB   ,v1
+        MRCNE   p15, 1, v1, c0, c0, 0 ; Get size info (instruction)
+        STR     v1, [a2, #Cache_Lx_ITable-Cache_Lx_DTable]
         AND     v1, v1, #7 ; Get line size
-        CMPNE   v1, a4
-        MOVLT   a4, v1 ; Earlier CMP will not set LE flags if v1=0
+        CMP     v1, a4
+        MOVLT   a4, v1
+        ; Shift the cache level ID register along to get the type of the next
+        ; cache level
+        ; However, we need to stop once we reach the first blank entry, because
+        ; ARM have been sneaky and started to reuse some of the bits from the
+        ; high end of the register (the Cortex-A8 TRM lists bits 21-23 as being
+        ; for cache level 8, but the ARMv7 ARM lists them as being for the level
+        ; of unification for inner shareable memory). The ARMv7 ARM does warn
+        ; about making sure you stop once you find the first blank entry, but
+        ; it doesn't say why!
+        TST     a1, #7
         ADD     a3, a3, #1
-        CMP     a3, #16
+        MOVNE   a1, a1, LSR #3
+        CMP     a3, #14 ; Stop after level 7 (even though an 8th level might exist on some CPUs?)
+        ADD     a2, a2, #4
         BLT     %BT10
         STRB    a4, [v6, #ICache_LineLen] ; Store log2(line size)-2
         STRB    v2, [v6, #DCache_LineLen] ; log2(line size)-2
@@ -1878,150 +1891,122 @@ MMU_ChangingUncachedEntries_WB_Cal_LD ROUT
 ; ICache_LineLen = log2(line len)-2 for smallest instruction cache line length
 ; DCache_RangeThreshold = clean threshold for data cache
 ; Cache_Lx_Info = Cache level ID register
-; Cache_Lx_DTable = Cache size identification register for all 8 data/unified caches
-; Cache_Lx_ITable = Cache size identification register for all 8 instruction caches
+; Cache_Lx_DTable = Cache size identification register for all 7 data/unified caches
+; Cache_Lx_ITable = Cache size identification register for all 7 instruction caches
 
-Cache_CleanAll_WB_CR7_Lx ROUT
-; Clean cache by traversing all sets and ways for all data caches
-        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
+; ARMv7 cache maintenance routines are a bit long-winded, so we use this macro
+; to reduce the risk of mistakes creeping in due to code duplication
+;
+; $op: Operation to perform ('clean', 'invalidate', 'cleaninvalidate')
+; $levels: Which levels to apply to ('lou', 'loc', 'louis')
+; Uses r0-r8 & lr as temp
+; Performs the indicated op on the indicated data & unified caches
+;
+; Code based around the alternate/faster code given in the ARMv7 ARM (section
+; B2.2.4, alternate/faster code only in doc revision 9), but tightened up a bit
+;
+; Note that HAL_InvalidateCache_ARMvF uses its own implementation of this
+; algorithm, since it must cope with different temporary registers and it needs
+; to read the cache info straight from the CP15 registers
+;
+        MACRO
+        MaintainDataCache_WB_CR7_Lx $op, $levels
         LDR     lr, =ZeroPage
-        LDR     a1, [lr, #Cache_Lx_Info]!
+        LDR     r0, [lr, #Cache_Lx_Info]!
         ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
-        BIC     a1, a1, #&FF000000 ; Discard unification/coherency bits
-        MOV     a2, #0 ; Current cache level
-20
-        TST     a1, #7 ; Get flags
-        BEQ     %FT10 ; Cache clean complete
-        LDR     a3, [lr], #4 ; Get size info
-        AND     v1, a3, #&7 ; log2(Line size)-2
-        BIC     a3, a3, #&F0000007 ; Clear flags & line size
-        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
-        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
-        ; Way number needs to be packed right up at the high end of the data word; shift it up
-        CLZ     a4, v2
-        MOV     v2, v2, LSL a4
-        ; Set number needs to start at log2(Line size)+2
-        MOV     v3, v3, LSL #4 ; Start at bit 4
-        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
-        ; Now calculate the offset numbers we will use to increment sets & ways
-        BIC     v4, v2, v2, LSL #1 ; Way increment
-        BIC     v5, v3, v3, LSL #1 ; Set increment
-        ; Now we can finally clean this cache!
-        ORR     a3, a2, v3 ; Current way (0), set (max), and level
-30
-        MCR     p15, 0, a3, c7, c10, 2 ; Clean
-        ADDS    a3, a3, v4 ; Increment way
-        BCC     %BT30 ; Overflow will occur once ways are enumerated
-        TST     a3, v3 ; Are set bits all zero?
-        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
-        BNE     %BT30
-        ; This cache is now clean. Move on to the next level.
-        ADD     a2, a2, #2
-        MOVS    a1, a1, LSR #3
-        BNE     %BT20
-10
-        myDSB   ,a1 ; Wait for cache cleaning to complete
-        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"
+      [ "$levels"="lou"
+        ANDS    r3, r0, #&38000000
+        MOV     r3, r3, LSR #26 ; Cache level value (naturally aligned)
+      |
+      [ "$levels"="loc"
+        ANDS    r3, r0, #&07000000
+        MOV     r3, r3, LSR #23 ; Cache level value (naturally aligned)
+      |
+      [ "$levels"="louis"
+        ANDS    r3, r0, #&00E00000
+        MOV     r3, r3, LSR #20 ; Cache level value (naturally aligned)
+      |
+        ! 1, "Unrecognised levels"
+      ]
+      ]
+      ]
+        BEQ     %FT50
+        MOV     r8, #0 ; Current cache level
+10 ; Loop1
+        ADD     r2, r8, r8, LSR #1 ; Work out 3 x cachelevel
+        MOV     r1, r0, LSR r2 ; bottom 3 bits are the Cache type for this level
+        AND     r1, r1, #7 ; get those 3 bits alone
+        CMP     r1, #2
+        BLT     %FT40 ; no cache or only instruction cache at this level
+        LDR     r1, [lr, r8, LSL #1] ; read CCSIDR to r1
+        AND     r2, r1, #&7 ; extract the line length field
+        ADD     r2, r2, #4 ; add 4 for the line length offset (log2 16 bytes)
+        LDR     r7, =&3FF
+        AND     r7, r7, r1, LSR #3 ; r7 is the max number on the way size (right aligned)
+        CLZ     r5, r7 ; r5 is the bit position of the way size increment
+        LDR     r4, =&7FFF
+        AND     r4, r4, r1, LSR #13 ; r4 is the max number of the index size (right aligned)
+20 ; Loop2
+        MOV     r1, r4 ; r1 working copy of the max index size (right aligned)
+30 ; Loop3
+        ORR     r6, r8, r7, LSL r5 ; factor in the way number and cache number into r6
+        ORR     r6, r6, r1, LSL r2 ; factor in the index number
+      [ "$op"="clean"
+        MCR     p15, 0, r6, c7, c10, 2 ; Clean
+      |
+      [ "$op"="invalidate"
+        MCR     p15, 0, r6, c7, c6, 2 ; Invalidate
+      |
+      [ "$op"="cleaninvalidate"
+        MCR     p15, 0, r6, c7, c14, 2 ; Clean & invalidate
+      |
+        ! 1, "Unrecognised op"
+      ]
+      ]
+      ]
+        SUBS    r1, r1, #1 ; decrement the index
+        BGE     %BT30
+        SUBS    r7, r7, #1 ; decrement the way number
+        BGE     %BT20
+40 ; Skip
+        ADD     r8, r8, #2
+        CMP     r3, r8
+        BGT     %BT10
+        myDSB   ,r0
+50 ; Finished
+        MEND
+
+Cache_CleanAll_WB_CR7_Lx ROUT
+; Clean cache by traversing all sets and ways for all data caches
+        Push    "r1-r8,lr"
+        MaintainDataCache_WB_CR7_Lx clean, loc
+        Pull    "r1-r8,pc"
 
 
 Cache_CleanInvalidateAll_WB_CR7_Lx ROUT
 ;
 ; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache
 ;
-        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
-        LDR     lr, =ZeroPage
-        LDR     a1, [lr, #Cache_Lx_Info]!
-        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
-        BIC     a1, a1, #&FF000000 ; Discard unification/coherency bits
-        MOV     a2, #0 ; Current cache level
-20
-        TST     a1, #7 ; Get flags
-        BEQ     %FT10 ; Cache clean complete
-        LDR     a3, [lr], #4 ; Get size info
-        AND     v1, a3, #&7 ; log2(Line size)-2
-        BIC     a3, a3, #&F0000007 ; Clear flags & line size
-        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
-        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
-        ; Way number needs to be packed right up at the high end of the data word; shift it up
-        CLZ     a4, v2
-        MOV     v2, v2, LSL a4
-        ; Set number needs to start at log2(Line size)+2
-        MOV     v3, v3, LSL #4 ; Start at bit 4
-        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
-        ; Now calculate the offset numbers we will use to increment sets & ways
-        BIC     v4, v2, v2, LSL #1 ; Way increment
-        BIC     v5, v3, v3, LSL #1 ; Set increment
-        ; Now we can finally clean this cache!
-        ORR     a3, a2, v3 ; Current way (0), set (max), and level
-30
-        MCR     p15, 0, a3, c7, c14, 2 ; Clean & invalidate
-        ADDS    a3, a3, v4 ; Increment way
-        BCC     %BT30 ; Overflow will occur once ways are enumerated
-        TST     a3, v3 ; Are set bits all zero?
-        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
-        BNE     %BT30
-        ; This cache is now clean. Move on to the next level.
-        ADD     a2, a2, #2
-        MOVS    a1, a1, LSR #3
-        BNE     %BT20
-10
-        MOV     a1, #0
-        myDSB   ,a1,,y                ; Wait for cache clean to complete
+        Push    "r1-r8,lr"
+        MaintainDataCache_WB_CR7_Lx cleaninvalidate, loc
         MCR     p15, 0, a1, c7, c5, 0 ; invalidate ICache
         MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
         myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
         myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
-        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"
+        Pull    "r1-r8,pc"
 
 
 Cache_InvalidateAll_WB_CR7_Lx ROUT
 ;
 ; no clean, assume caller knows what's happening
 ;
-        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
-        LDR     lr, =ZeroPage
-        LDR     a1, [lr, #Cache_Lx_Info]!
-        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
-        BIC     a1, a1, #&FF000000 ; Discard unification/coherency bits
-        MOV     a2, #0 ; Current cache level
-20
-        TST     a1, #7 ; Get flags
-        BEQ     %FT10 ; Cache clean complete
-        LDR     a3, [lr], #4 ; Get size info
-        AND     v1, a3, #&7 ; log2(Line size)-2
-        BIC     a3, a3, #&F0000007 ; Clear flags & line size
-        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
-        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
-        ; Way number needs to be packed right up at the high end of the data word; shift it up
-        CLZ     a4, v2
-        MOV     v2, v2, LSL a4
-        ; Set number needs to start at log2(Line size)+2
-        MOV     v3, v3, LSL #4 ; Start at bit 4
-        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
-        ; Now calculate the offset numbers we will use to increment sets & ways
-        BIC     v4, v2, v2, LSL #1 ; Way increment
-        BIC     v5, v3, v3, LSL #1 ; Set increment
-        ; Now we can finally clean this cache!
-        ORR     a3, a2, v3 ; Current way (0), set (max), and level
-30
-        MCR     p15, 0, a3, c7, c6, 2 ; Invalidate
-        ADDS    a3, a3, v4 ; Increment way
-        BCC     %BT30 ; Overflow will occur once ways are enumerated
-        TST     a3, v3 ; Are set bits all zero?
-        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
-        BNE     %BT30
-        ; This cache is now clean. Move on to the next level.
-        ADD     a2, a2, #2
-        MOVS    a1, a1, LSR #3
-        BNE     %BT20
-10
-        MOV     a1, #0
-        myDSB   ,a1,,y                ; Wait for invalidation to complete
+        Push    "r1-r8,lr"
+        MaintainDataCache_WB_CR7_Lx cleaninvalidate, loc
         MCR     p15, 0, a1, c7, c5, 0 ; invalidate ICache
         MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
         myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
         myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
-        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"
+        Pull    "r1-r8,pc"
 
 
 Cache_RangeThreshold_WB_CR7_Lx ROUT
@@ -2073,51 +2058,13 @@ IMB_Full_WB_CR7_Lx ROUT
 ; do: clean DCache; drain WBuffer, invalidate ICache/branch predictor
 ; Luckily, we only need to clean as far as the level of unification
 ;
-        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
-        LDR     lr, =ZeroPage
-        LDR     a1, [lr, #Cache_Lx_Info]!
-        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
-        MOV     a1, a1, LSR #27
-        AND     a1, a1, #&7 ; Get level of unification
-        MOV     a2, #0 ; Current cache level
-        SUBS    a1, a1, #1
-        BLT     %FT10 ; Cache clean complete
-20
-        LDR     a3, [lr], #4 ; Get size info
-        AND     v1, a3, #&7 ; log2(Line size)-2
-        BIC     a3, a3, #&F0000007 ; Clear flags & line size
-        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
-        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
-        ; Way number needs to be packed right up at the high end of the data word; shift it up
-        CLZ     a4, v2
-        MOV     v2, v2, LSL a4
-        ; Set number needs to start at log2(Line size)+2
-        MOV     v3, v3, LSL #4 ; Start at bit 4
-        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
-        ; Now calculate the offset numbers we will use to increment sets & ways
-        BIC     v4, v2, v2, LSL #1 ; Way increment
-        BIC     v5, v3, v3, LSL #1 ; Set increment
-        ; Now we can finally clean this cache!
-        ORR     a3, a2, v3 ; Current way (0), set (max), and level
-30
-        MCR     p15, 0, a3, c7, c10, 2 ; Clean
-        ADDS    a3, a3, v4 ; Increment way
-        BCC     %BT30 ; Overflow will occur once ways are enumerated
-        TST     a3, v3 ; Are set bits all zero?
-        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
-        BNE     %BT30
-        ; This cache is now clean. Move on to the next level.
-        ADD     a2, a2, #2
-        SUBS    a1, a1, #1
-        BGE     %BT20
-10
-        MOV     a1, #0
-        myDSB   ,a1,,y                ; Wait for clean to complete
+        Push    "r1-r8,lr"
+        MaintainDataCache_WB_CR7_Lx clean, lou
         MCR     p15, 0, a1, c7, c5, 0 ; invalidate ICache
         MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
         myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
         myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
-        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"
+        Pull    "r1-r8,pc"
 
 ;  a1 = start address (inclusive, cache line aligned)
 ;  a2 = end address (exclusive, cache line aligned)
@@ -2160,7 +2107,7 @@ MMU_Changing_WB_CR7_Lx ROUT
         BL      Cache_CleanInvalidateAll_WB_CR7_Lx
         MOV     a1, #0
         MCR     p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB
-        myDSB   ,a1,,y                ; Wait TLB invalidation to complete
+        myDSB   ,a1,,y                ; Wait for TLB invalidation to complete
         myISB   ,a1,,y                ; Ensure that the effects are visible
         Pull    "pc"
 
diff --git a/s/HAL b/s/HAL
index f0d8637..2a882db 100644
--- a/s/HAL
+++ b/s/HAL
@@ -1074,54 +1074,56 @@ HAL_InvalidateCache_ARMvF
         ; The only register we can safely change is ip, but we can switch into FIQ mode with interrupts disabled and use the banked registers there
         MRS     ip, CPSR
         MSR     CPSR_c, #F32_bit+I32_bit+FIQ32_mode
-        MOV     r8, #0
-        MCR     p15, 0, r8, c7, c5, 0           ; invalidate instruction cache
-        MCR     p15, 0, r8, c8, c7, 0           ; invalidate TLBs
-        MCR     p15, 0, r8, c7, c5, 6           ; invalidate branch target predictor
-        myDSB   ,r8,,y                          ; Wait for completion
-        myISB   ,r8,,y
+        MOV     r9, #0
+        MCR     p15, 0, r9, c7, c5, 0           ; invalidate instruction cache
+        MCR     p15, 0, r9, c8, c7, 0           ; invalidate TLBs
+        MCR     p15, 0, r9, c7, c5, 6           ; invalidate branch target predictor
+        myDSB   ,r9,,y                          ; Wait for completion
+        myISB   ,r9,,y
         ; Check whether we're ARMv7 (and thus multi-level cache) or ARMv6 (and thus single-level cache)
-        MRC     p15, 0, r9, c0, c0, 1
-        TST     r9, #&80000000 ; EQ=ARMv6, NE=ARMv7
-        MCREQ   ARM_config_cp,0,r8,ARMv4_cache_reg,C7 ; ARMv3-ARMv6 I+D cache flush
-        BEQ     %FT10 ; Skip to the end
-
-        MRC     p15, 1, r8, c0, c0, 1 ; Cache level ID register
-        BIC     r8, r8, #&FF000000 ; Discard unification/coherency bits
-        MOV     r9, #0 ; Current cache level
-20
-        TST     r8, #7 ; Get flags
-        BEQ     %FT10 ; Cache clean complete
-        MCR     p15, 2, r9, c0, c0, 0 ; Program cache size selection register
-        myISB   ,r8,,y
-        MRC     p15, 1, r10, c0, c0, 0 ; Get size info
-        AND     r11, r10, #&7 ; log2(Line size)-2
-        BIC     r10, r10, #&F0000007 ; Clear flags & line size
-        MOV     r12, r10, LSL #19 ; Number of ways-1 in upper 10 bits
-        MOV     r10, r10, LSR #13 ; Number of sets-1 in lower 15 bits
-        ; Way number needs to be packed right up at the high end of the data word; shift it up
-        CLZ     r14, r12
-        MOV     r12, r12, LSL r14
-        ; Set number needs to start at log2(Line size)+2
-        MOV     r10, r10, LSL #4 ; Start at bit 4
-        MOV     r10, r10, LSL r11 ; Start at log2(Line size)+2
-        ; Now calculate the offset numbers we will use to increment sets & ways
-        BIC     r12, r12, r12, LSL #1 ; Way increment
-        BIC     r11, r10, r10, LSL #1 ; Set increment
-        ; Now we can finally clean this cache!
-        ORR     r14, r9, r10 ; Current way (0), set (max), and level
-30
+        MRC     p15, 0, r8, c0, c0, 1
+        TST     r8, #&80000000 ; EQ=ARMv6, NE=ARMv7
+        MCREQ   ARM_config_cp,0,r9,ARMv4_cache_reg,C7 ; ARMv3-ARMv6 I+D cache flush
+        BEQ     %FT50 ; Skip to the end
+
+        ; This is basically the same algorithm as the MaintainDataCache_WB_CR7_Lx macro, but tweaked to use less registers and to read from CP15 directly
+        TST     r8, #&07000000
+        BEQ     %FT50
+        MOV     r11, #0 ; Current cache level
+10 ; Loop1
+        ADD     r10, r11, r11, LSR #1 ; Work out 3 x cachelevel
+        MOV     r9, r8, LSR r10 ; bottom 3 bits are the Cache type for this level
+        AND     r9, r9, #7 ; get those 3 bits alone
+        CMP     r9, #2
+        BLT     %FT40 ; no cache or only instruction cache at this level
+        MCR     p15, 2, r11, c0, c0, 0 ; write CSSELR from r11
+        myISB   ,r9
+        MRC     p15, 1, r9, c0, c0, 0 ; read current CCSIDR to r9
+        AND     r10, r9, #&7 ; extract the line length field
+        ADD     r10, r10, #4 ; add 4 for the line length offset (log2 16 bytes)
+        LDR     r8, =&3FF
+        AND     r8, r8, r9, LSR #3 ; r8 is the max number on the way size (right aligned)
+        CLZ     r13, r8 ; r13 is the bit position of the way size increment
+        LDR     r12, =&7FFF
+        AND     r12, r12, r9, LSR #13 ; r12 is the max number of the index size (right aligned)
+20 ; Loop2
+        MOV     r9, r12 ; r9 working copy of the max index size (right aligned)
+30 ; Loop3
+        ORR     r14, r11, r8, LSL r13 ; factor in the way number and cache number into r14
+        ORR     r14, r14, r9, LSL r10 ; factor in the index number
         MCR     p15, 0, r14, c7, c6, 2 ; Invalidate
-        ADDS    r14, r14, r12 ; Increment way
-        BCC     %BT30 ; Overflow will occur once ways are enumerated
-        TST     r14, r10 ; Are set bits all zero?
-        SUBNE   r14, r14, r11 ; No, so decrement set and loop around again
-        BNE     %BT30
-        ; This cache is now clean. Move on to the next level.
-        ADD     r9, r9, #2
-        MOVS    r8, r8, LSR #3
-        BNE     %BT20
-10
+        SUBS    r9, r9, #1 ; decrement the index
+        BGE     %BT30
+        SUBS    r8, r8, #1 ; decrement the way number
+        BGE     %BT20
+        MRC     p15, 0, r8, c0, c0, 1
+40 ; Skip
+        ADD     r11, r11, #2
+        AND     r14, r8, #&07000000
+        CMP     r14, r11, LSL #23
+        BGT     %BT10
+
+50 ; Finished
         ; Wait for clean to complete
         MOV     r8, #0
         myDSB   ,r8,,y
diff --git a/s/NewReset b/s/NewReset
index d8baa5d..f6cc34f 100644
--- a/s/NewReset
+++ b/s/NewReset
@@ -687,15 +687,13 @@ kbdwait
         SUBS    r6, r6, #1              ; else wait a maximum of 5 seconds.
         BNE     kbdwait
 kbddone
-        DebugTX "Keyboard scan complete"
         MSR     CPSR_c, #I32_bit+SVC32_mode
-        DebugTX "FIQ enabled"
         CallHAL HAL_KbdScanFinish
         LDR     r1, =ZeroPage+InitIRQWs
         MOV     r0, #0
         STRB    r0, [r1, #KbdScanActive]
         MSR     CPSR_c, #SVC32_mode
-        DebugTX "IRQ enabled"
+        DebugTX "Keyboard scan complete"
  |
     [ KeyWait <> 0
 ; Check for keyboard there every 1/5 sec. but give up after 2 secs.
-- 
GitLab