; Copyright 2000 Pace Micro Technology plc
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
  ;      GET     Hdr:ListOpts
  ;      GET     Hdr:Macros
  ;      GET     Hdr:System
  ;      $GetCPU
  ;      $GetMEMM

  ;      GET     hdr.Options

  ;      GET     Hdr:PublicWS
  ;      GET     Hdr:KernelWS

  ;      GET     hdr.Copro15ops
  ;      GET     hdr.ARMops

v7      RN      10

  ;      EXPORT  Init_ARMarch
  ;      EXPORT  ARM_Analyse
  ;      EXPORT  ARM_PrintProcessorType

 ;       AREA    KernelCode,CODE,READONLY

; ARM keep changing their mind about ID field layout.
; Here's a summary, courtesy of the ARM ARM (v5):
;
; pre-ARM 7:   xxxx0xxx
; ARM 7:       xxxx7xxx where bit 23 indicates v4T/~v3
; post-ARM 7:  xxxanxxx where n<>0 or 7 and a = architecture (1=4,2=4T,3=5,4=5T)
;

; int Init_ARMarch(void)
; Returns architecture, as above in a1. Also EQ if ARMv3, NE if ARMv4 or later.
; Corrupts only ip, no RAM usage.
Init_ARMarch
        ARM_read_ID ip
        ANDS    a1, ip, #&0000F000
        MOVEQ   pc, lr                          ; ARM 3 or ARM 6
        TEQ     a1, #&00007000
        BNE     %FT20
        TST     ip, #&00800000                  ; ARM 7 - check for Thumb
        MOVNE   a1, #ARMv4T
        MOVEQ   a1, #ARMv3
        MOV     pc, lr
20      ANDS    a1, ip, #&000F0000              ; post-ARM 7
        MOV     a1, a1, LSR #16
        MOV     pc, lr



ARM_Analyse
        MOV     a2, lr
        BL      Init_ARMarch
        MOV     lr, a2
 [ MEMM_Type = "VMSAv6"
        CMP     a1, #ARMvF
        BEQ     ARM_Analyse_Fancy ; New ARM; use the feature regs to perform all the setup
 ]
        Push    "v1,v2,v5,v6,v7,lr"
        ARM_read_ID v1
        ARM_read_cachetype v2
        LDR     v6, =ZeroPage

        ADRL    v7, KnownCPUTable
FindARMloop
        LDMIA   v7!, {a1, a2}                   ; See if it's a known ARM
        CMP     a1, #-1
        BEQ     %FT20
        AND     a2, v1, a2
        TEQ     a1, a2
        ADDNE   v7, v7, #8
        BNE     FindARMloop
        TEQ     v2, v1                          ; If we don't have cache attributes, read from table
        LDREQ   v2, [v7]

20      TEQ     v2, v1
        BEQ     %BT20                           ; Cache unknown: panic

        CMP     a1, #-1
        LDRNEB  a2, [v7, #4]
        MOVEQ   a2, #ARMunk
        STRB    a2, [v6, #ProcessorType]

        ASSERT  CT_Isize_pos = 0
        MOV     a1, v2
        ADD     a2, v6, #ICache_Info
        BL      EvaluateCache
        MOV     a1, v2, LSR #CT_Dsize_pos
        ADD     a2, v6, #DCache_Info
        BL      EvaluateCache

        AND     a1, v2, #CT_ctype_mask
        MOV     a1, a1, LSR #CT_ctype_pos
        STRB    a1, [v6, #Cache_Type]

        [ No26bitCode
        MOV     v5, #CPUFlag_32bitOS
        |
        MOV     v5, #0
        ]
        [ HiProcVecs
        ORR     v5, v5, #CPUFlag_HiProcVecs
        ]

        TST     v2, #CT_S
        ORRNE   v5, v5, #CPUFlag_SplitCache+CPUFlag_SynchroniseCodeAreas

        [ CacheOff
        ORR     v5, v5, #CPUFlag_SynchroniseCodeAreas
        |
        ARM_read_control a1                     ; if Z bit set then we have branch prediction,
        TST     a1, #MMUC_Z                     ; so we need OS_SynchroniseCodeAreas even if not
        ORRNE   v5, v5, #CPUFlag_SynchroniseCodeAreas   ; split caches
        ]

        ; Test abort timing (base restored or base updated)
        MOV     a1, #&8000
        LDR     a2, [a1], #4                    ; Will abort - DAb handler will continue execution
        TEQ     a1, #&8000
        ORREQ   v5, v5, #CPUFlag_BaseRestored

        ; Check store of PC
30      STR     pc, [sp, #-4]!
        ADR     a2, %BT30 + 8
        LDR     a1, [sp], #4
        TEQ     a1, a2
        ORREQ   v5, v5, #CPUFlag_StorePCplus8

        [ 0=1
        ; Check whether 26-bit mode is available
        MSR     CPSR_c, #F32_bit+I32_bit+SVC26_mode
        MRS     a1, CPSR
        AND     a1, a1, #M32_bits
        TEQ     a1, #SVC26_mode
        ORRNE   v5, v5, #CPUFlag_No26bitMode
        MSREQ   CPSR_c, #F32_bit+I32_bit+SVC32_mode
        BNE     %FT35

        ; Do we get vector exceptions on read?
        LDR     a2, =ZeroPage
        MOV     a1, a2
        LDR     a1, [a1]                        ; If this aborts a1 will be left unchanged
        TEQ     a1, a2
        ORREQ   v5, v5, #CPUFlag_VectorReadException
        ]
35

        BL      Init_ARMarch
        STRB    a1, [v6, #ProcessorArch]

        TEQ     a1, #ARMv3                      ; assume long multiply available
        ORRNE   v5, v5, #CPUFlag_LongMul        ; if v4 or later
        TEQNE   a1, #ARMv4                      ; assume 26-bit available
        ORRNE   v5, v5, #CPUFlag_No26bitMode    ; iff v3 or v4 (not T)
        TEQNE   a1, #ARMv5                      ; assume Thumb available
        ORRNE   v5, v5, #CPUFlag_Thumb          ; iff not v3,v4,v5

        MSR     CPSR_f, #Q32_bit
        MRS     lr, CPSR
        TST     lr, #Q32_bit
        ORRNE   v5, v5, #CPUFlag_DSP

        LDRB    v4, [v6, #ProcessorType]

        TEQ     v4, #ARMunk                     ; Modify deduced flags
        ADRNEL  lr, KnownCPUFlags
        ADDNE   lr, lr, v4, LSL #3
        LDMNEIA lr, {a2, a3}
        ORRNE   v5, v5, a2
        BICNE   v5, v5, a3

 [ XScaleJTAGDebug
        TST     v5, #CPUFlag_XScale
        BEQ     %FT40

        MRC     p14, 0, a2, c10, c0             ; Read debug control register
        TST     a2, #&80000000
        ORRNE   v5, v5, #CPUFlag_XScaleJTAGconnected
        MOVEQ   a2, #&C000001C                  ; enable hot debug
        MCREQ   p14, 0, a2, c10, c0
        BNE     %FT40
40
 ]

        STR     v5, [v6, #ProcessorFlags]

        ; Now, a1 = processor architecture (ARMv3, ARMv4 ...)
        ;      v4 = processor type (ARM600, ARM610, ...)
        ;      v5 = processor flags

        CMP     a1, #ARMv4
        BLO     Analyse_ARMv3                   ; eg. ARM710

        LDRB    a2, [v6, #Cache_Type]
        TEQ     a2, #CT_ctype_WT
        TSTEQ   v5, #CPUFlag_SplitCache
        BEQ     Analyse_WriteThroughUnified     ; eg. ARM7TDMI derivative

        TEQ     a2, #CT_ctype_WB_CR7_LDa
        BEQ     Analyse_WB_CR7_LDa              ; eg. ARM9

        TEQ     a2, #CT_ctype_WB_Crd
        BEQ     Analyse_WB_Crd                  ; eg. StrongARM

        TEQ     a2, #CT_ctype_WB_Cal_LD
        BEQ     Analyse_WB_Cal_LD               ; assume XScale

        ; others ...

WeirdARMPanic
        B       WeirdARMPanic                   ; stiff :)

Analyse_ARMv3
        ADRL    a1, NullOp
        ADRL    a2, Cache_Invalidate_ARMv3
        ADRL    a3, WriteBuffer_Drain_ARMv3
        ADRL    a4, TLB_Invalidate_ARMv3
        ADRL    ip, TLB_InvalidateEntry_ARMv3

        STR     a1, [v6, #Proc_Cache_CleanAll]
        STR     a2, [v6, #Proc_Cache_CleanInvalidateAll]
        STR     a2, [v6, #Proc_Cache_InvalidateAll]
        STR     a3, [v6, #Proc_WriteBuffer_Drain]
        STR     a4, [v6, #Proc_TLB_InvalidateAll]
        STR     ip, [v6, #Proc_TLB_InvalidateEntry]
        STR     a1, [v6, #Proc_IMB_Full]
        STR     a1, [v6, #Proc_IMB_Range]

        ADRL    a1, MMU_Changing_ARMv3
        ADRL    a2, MMU_ChangingEntry_ARMv3
        ADRL    a3, MMU_ChangingUncached_ARMv3
        ADRL    a4, MMU_ChangingUncachedEntry_ARMv3
        STR     a1, [v6, #Proc_MMU_Changing]
        STR     a2, [v6, #Proc_MMU_ChangingEntry]
        STR     a3, [v6, #Proc_MMU_ChangingUncached]
        STR     a4, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_ARMv3
        ADRL    a2, MMU_ChangingUncachedEntries_ARMv3
        ADRL    a3, Cache_RangeThreshold_ARMv3
        STR     a1, [v6, #Proc_MMU_ChangingEntries]
        STR     a2, [v6, #Proc_MMU_ChangingUncachedEntries]
        STR     a3, [v6, #Proc_Cache_RangeThreshold]

        ADRL    a1, XCBTableWT
        STR     a1, [v6, #MMU_PCBTrans]
        B       %FT90

Analyse_WriteThroughUnified
        ADRL    a1, NullOp
        ADRL    a2, Cache_InvalidateUnified
        TST     v5, #CPUFlag_NoWBDrain
        ADRNEL  a3, WriteBuffer_Drain_OffOn
        ADREQL  a3, WriteBuffer_Drain
        ADRL    a4, TLB_Invalidate_Unified
        ADRL    ip, TLB_InvalidateEntry_Unified

        STR     a1, [v6, #Proc_Cache_CleanAll]
        STR     a2, [v6, #Proc_Cache_CleanInvalidateAll]
        STR     a2, [v6, #Proc_Cache_InvalidateAll]
        STR     a3, [v6, #Proc_WriteBuffer_Drain]
        STR     a4, [v6, #Proc_TLB_InvalidateAll]
        STR     ip, [v6, #Proc_TLB_InvalidateEntry]
        STR     a1, [v6, #Proc_IMB_Full]
        STR     a1, [v6, #Proc_IMB_Range]

        ADRL    a1, MMU_Changing_Writethrough
        ADRL    a2, MMU_ChangingEntry_Writethrough
        ADRL    a3, MMU_ChangingUncached
        ADRL    a4, MMU_ChangingUncachedEntry
        STR     a1, [v6, #Proc_MMU_Changing]
        STR     a2, [v6, #Proc_MMU_ChangingEntry]
        STR     a3, [v6, #Proc_MMU_ChangingUncached]
        STR     a4, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_Writethrough
        ADRL    a2, MMU_ChangingUncachedEntries
        ADRL    a3, Cache_RangeThreshold_Writethrough
        STR     a1, [v6, #Proc_MMU_ChangingEntries]
        STR     a2, [v6, #Proc_MMU_ChangingUncachedEntries]
        STR     a3, [v6, #Proc_Cache_RangeThreshold]

        ADRL    a1, XCBTableWT
        STR     a1, [v6, #MMU_PCBTrans]
        B       %FT90

Analyse_WB_CR7_LDa
        TST     v5, #CPUFlag_SplitCache
        BEQ     WeirdARMPanic             ; currently, only support harvard caches here (eg. ARM920)

        ADRL    a1, Cache_CleanInvalidateAll_WB_CR7_LDa
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]

        ADRL    a1, Cache_CleanAll_WB_CR7_LDa
        STR     a1, [v6, #Proc_Cache_CleanAll]

        ADRL    a1, Cache_InvalidateAll_WB_CR7_LDa
        STR     a1, [v6, #Proc_Cache_InvalidateAll]

        ADRL    a1, Cache_RangeThreshold_WB_CR7_LDa
        STR     a1, [v6, #Proc_Cache_RangeThreshold]

        ADRL    a1, TLB_InvalidateAll_WB_CR7_LDa
        STR     a1, [v6, #Proc_TLB_InvalidateAll]

        ADRL    a1, TLB_InvalidateEntry_WB_CR7_LDa
        STR     a1, [v6, #Proc_TLB_InvalidateEntry]

        ADRL    a1, WriteBuffer_Drain_WB_CR7_LDa
        STR     a1, [v6, #Proc_WriteBuffer_Drain]

        ADRL    a1, IMB_Full_WB_CR7_LDa
        STR     a1, [v6, #Proc_IMB_Full]

        ADRL    a1, IMB_Range_WB_CR7_LDa
        STR     a1, [v6, #Proc_IMB_Range]

        ADRL    a1, MMU_Changing_WB_CR7_LDa
        STR     a1, [v6, #Proc_MMU_Changing]

        ADRL    a1, MMU_ChangingEntry_WB_CR7_LDa
        STR     a1, [v6, #Proc_MMU_ChangingEntry]

        ADRL    a1, MMU_ChangingUncached_WB_CR7_LDa
        STR     a1, [v6, #Proc_MMU_ChangingUncached]

        ADRL    a1, MMU_ChangingUncachedEntry_WB_CR7_LDa
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_WB_CR7_LDa
        STR     a1, [v6, #Proc_MMU_ChangingEntries]

        ADRL    a1, MMU_ChangingUncachedEntries_WB_CR7_LDa
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntries]

        LDRB    a2, [v6, #DCache_Associativity]

        MOV     a3, #256
        MOV     a4, #8           ; to find log2(ASSOC), rounded up
Analyse_WB_CR7_LDa_L1
        MOV     a3, a3, LSR #1
        SUB     a4, a4, #1
        CMP     a2, a3
        BLO     Analyse_WB_CR7_LDa_L1
        ADDHI   a4, a4, #1

        RSB     a2, a4, #32
        MOV     a3, #1
        MOV     a3, a3, LSL a2
        STR     a3, [v6, #DCache_IndexBit]
        LDR     a4, [v6, #DCache_NSets]
        LDRB    a2, [v6, #DCache_LineLen]
        SUB     a4, a4, #1
        MUL     a4, a2, a4
        STR     a4, [v6, #DCache_IndexSegStart]

        MOV     a2, #64*1024                         ; arbitrary-ish
        STR     a2, [v6, #DCache_RangeThreshold]

        ADRL    a1, XCBTableWBR                      ; assume read-allocate WB/WT cache
        STR     a1, [v6, #MMU_PCBTrans]

        B       %FT90

Analyse_WB_Crd
        TST     v5, #CPUFlag_SplitCache
        BEQ     WeirdARMPanic             ; currently, only support harvard

        ADRL    a1, Cache_CleanInvalidateAll_WB_Crd
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]

        ADRL    a1, Cache_CleanAll_WB_Crd
        STR     a1, [v6, #Proc_Cache_CleanAll]

        ADRL    a1, Cache_InvalidateAll_WB_Crd
        STR     a1, [v6, #Proc_Cache_InvalidateAll]

        ADRL    a1, Cache_RangeThreshold_WB_Crd
        STR     a1, [v6, #Proc_Cache_RangeThreshold]

        ADRL    a1, TLB_InvalidateAll_WB_Crd
        STR     a1, [v6, #Proc_TLB_InvalidateAll]

        ADRL    a1, TLB_InvalidateEntry_WB_Crd
        STR     a1, [v6, #Proc_TLB_InvalidateEntry]

        ADRL    a1, WriteBuffer_Drain_WB_Crd
        STR     a1, [v6, #Proc_WriteBuffer_Drain]

        ADRL    a1, IMB_Full_WB_Crd
        STR     a1, [v6, #Proc_IMB_Full]

        ADRL    a1, IMB_Range_WB_Crd
        STR     a1, [v6, #Proc_IMB_Range]

        ADRL    a1, MMU_Changing_WB_Crd
        STR     a1, [v6, #Proc_MMU_Changing]

        ADRL    a1, MMU_ChangingEntry_WB_Crd
        STR     a1, [v6, #Proc_MMU_ChangingEntry]

        ADRL    a1, MMU_ChangingUncached_WB_Crd
        STR     a1, [v6, #Proc_MMU_ChangingUncached]

        ADRL    a1, MMU_ChangingUncachedEntry_WB_Crd
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_WB_Crd
        STR     a1, [v6, #Proc_MMU_ChangingEntries]

        ADRL    a1, MMU_ChangingUncachedEntries_WB_Crd
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntries]

        LDR     a2, =DCacheCleanAddress
        STR     a2, [v6, #DCache_CleanBaseAddress]
        STR     a2, [v6, #DCache_CleanNextAddress]
        MOV     a2, #64*1024                       ;arbitrary-ish threshold
        STR     a2, [v6, #DCache_RangeThreshold]

        LDRB    a2, [v6, #ProcessorType]
        TEQ     a2, #SA110
        ADREQL  a2, XCBTableSA110
        BEQ     Analyse_WB_Crd_finish
        TEQ     a2, #SA1100
        TEQNE   a2, #SA1110
        ADREQL  a2, XCBTableSA1110
        ADRNEL  a2, XCBTableWBR
Analyse_WB_Crd_finish
        STR     a2, [v6, #MMU_PCBTrans]
        B       %FT90

Analyse_WB_Cal_LD
        TST     v5, #CPUFlag_SplitCache
        BEQ     WeirdARMPanic             ; currently, only support harvard

        ADRL    a1, Cache_CleanInvalidateAll_WB_Cal_LD
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]

        ADRL    a1, Cache_CleanAll_WB_Cal_LD
        STR     a1, [v6, #Proc_Cache_CleanAll]

        ADRL    a1, Cache_InvalidateAll_WB_Cal_LD
        STR     a1, [v6, #Proc_Cache_InvalidateAll]

        ADRL    a1, Cache_RangeThreshold_WB_Cal_LD
        STR     a1, [v6, #Proc_Cache_RangeThreshold]

        ADRL    a1, TLB_InvalidateAll_WB_Cal_LD
        STR     a1, [v6, #Proc_TLB_InvalidateAll]

        ADRL    a1, TLB_InvalidateEntry_WB_Cal_LD
        STR     a1, [v6, #Proc_TLB_InvalidateEntry]

        ADRL    a1, WriteBuffer_Drain_WB_Cal_LD
        STR     a1, [v6, #Proc_WriteBuffer_Drain]

        ADRL    a1, IMB_Full_WB_Cal_LD
        STR     a1, [v6, #Proc_IMB_Full]

        ADRL    a1, IMB_Range_WB_Cal_LD
        STR     a1, [v6, #Proc_IMB_Range]

        ADRL    a1, MMU_Changing_WB_Cal_LD
        STR     a1, [v6, #Proc_MMU_Changing]

        ADRL    a1, MMU_ChangingEntry_WB_Cal_LD
        STR     a1, [v6, #Proc_MMU_ChangingEntry]

        ADRL    a1, MMU_ChangingUncached_WB_Cal_LD
        STR     a1, [v6, #Proc_MMU_ChangingUncached]

        ADRL    a1, MMU_ChangingUncachedEntry_WB_Cal_LD
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_WB_Cal_LD
        STR     a1, [v6, #Proc_MMU_ChangingEntries]

        ADRL    a1, MMU_ChangingUncachedEntries_WB_Cal_LD
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntries]

        LDR     a2, =DCacheCleanAddress
        STR     a2, [v6, #DCache_CleanBaseAddress]
        STR     a2, [v6, #DCache_CleanNextAddress]

  [ XScaleMiniCache
        !       1, "You need to arrange for XScale mini-cache clean area to be mini-cacheable"
        LDR     a2, =DCacheCleanAddress + 4 * 32*1024
        STR     a2, [v6, #MCache_CleanBaseAddress]
        STR     a2, [v6, #MCache_CleanNextAddress]
  ]


  ; arbitrary-ish values, mini cache makes global op significantly more expensive
  [ XScaleMiniCache
        MOV     a2, #128*1024
  |
        MOV     a2, #32*1024
  ]
        STR     a2, [v6, #DCache_RangeThreshold]

        ; enable full coprocessor access
        LDR     a2, =&3FFF
        MCR     p15, 0, a2, c15, c1

        ADRL    a2, XCBTableXScaleWA ; choose between RA and WA here
        STR     a2, [v6, #MMU_PCBTrans]

        B       %FT90

 [ MEMM_Type = "VMSAv6"
Analyse_WB_CR7_Lx
        TST     v5, #CPUFlag_SplitCache
        BEQ     WeirdARMPanic             ; currently, only support harvard caches here

        ; Read the cache info into Cache_Lx_*
        MRC     p15, 1, a1, c0, c0, 1 ; Cache level ID register
        MOV     v2, v6 ; Work around DTable/ITable alignment issues
        STR     a1, [v2, #Cache_Lx_Info]!
        ADD     a1, v2, #Cache_Lx_DTable-Cache_Lx_Info
        ADD     a2, v2, #Cache_Lx_ITable-Cache_Lx_Info
        MOV     a3, #0
        MOV     a4, #256 ; Smallest instruction cache line length
        MOV     v2, #256 ; Smallest data/unified cache line length (although atm we only need this to be the smallest data cache line length)
10
        MCR     p15, 2, a3, c0, c0, 0 ; Program cache size selection register
        MRC     p15, 1, v1, c0, c0, 0 ; Get size info (data/unified)
        STR     v1, [a1],#4
        CMP     v1, #0 ; Does the cache exist?
        AND     v1, v1, #7 ; Get line size
        CMPNE   v1, v2
        MOVLT   v2, v1 ; Earlier CMP will not set LE flags if v1=0
        ADD     a3, a3, #1
        MCR     p15, 2, a3, c0, c0, 0 ; Program cache size selection register
        MRC     p15, 1, v1, c0, c0, 0 ; Get size info (instruction)
        STR     v1, [a2],#4
        CMP     v1, #0 ; Does the cache exist?
        AND     v1, v1, #7 ; Get line size
        CMPNE   v1, a4
        MOVLT   a4, v1 ; Earlier CMP will not set LE flags if v1=0
        ADD     a3, a3, #1
        CMP     a3, #16
        BLT     %BT10
        STRB    a4, [v6, #ICache_LineLen] ; Store log2(line size)-2
        STRB    v2, [v6, #DCache_LineLen] ; log2(line size)-2

        ; Calculate DCache_RangeThreshold
        MOV     a1, #128*1024 ; Arbitrary-ish
        STR     a1, [v6, #DCache_RangeThreshold]

        ADRL    a1, Cache_CleanInvalidateAll_WB_CR7_Lx
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]

        ADRL    a1, Cache_CleanAll_WB_CR7_Lx
        STR     a1, [v6, #Proc_Cache_CleanAll]

        ADRL    a1, Cache_InvalidateAll_WB_CR7_Lx
        STR     a1, [v6, #Proc_Cache_InvalidateAll]

        ADRL    a1, Cache_RangeThreshold_WB_CR7_Lx
        STR     a1, [v6, #Proc_Cache_RangeThreshold]

        ADRL    a1, TLB_InvalidateAll_WB_CR7_Lx
        STR     a1, [v6, #Proc_TLB_InvalidateAll]

        ADRL    a1, TLB_InvalidateEntry_WB_CR7_Lx
        STR     a1, [v6, #Proc_TLB_InvalidateEntry]

        ADRL    a1, WriteBuffer_Drain_WB_CR7_Lx
        STR     a1, [v6, #Proc_WriteBuffer_Drain]

        ADRL    a1, IMB_Full_WB_CR7_Lx
        STR     a1, [v6, #Proc_IMB_Full]

        ADRL    a1, IMB_Range_WB_CR7_Lx
        STR     a1, [v6, #Proc_IMB_Range]

        ADRL    a1, MMU_Changing_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_Changing]

        ADRL    a1, MMU_ChangingEntry_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingEntry]

        ADRL    a1, MMU_ChangingUncached_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingUncached]

        ADRL    a1, MMU_ChangingUncachedEntry_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingEntries]

        ADRL    a1, MMU_ChangingUncachedEntries_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntries]

        ADRL    a1, XCBTableWBR                      ; assume read-allocate WB/WT cache
        STR     a1, [v6, #MMU_PCBTrans]

        ; Enable L2 cache. This could probably be moved earlier on in the boot sequence (e.g. when the MMU is turned on), but for now it will go here to reduce the chances of stuff breaking
        BL      Cache_CleanInvalidateAll_WB_CR7_Lx ; Ensure L2 cache is clean

        MRC     p15, 0, a1, c1, c0, 1
        ORR     a1, a1, #2 ; L2EN
        MCR     p15, 0, a1, c1, c0, 1

        B       %FT90
 ] ; MEMM_Type = "VMSAv6"

90
        Pull    "v1,v2,v5,v6,v7,pc"


; This routine works out the values LINELEN, ASSOCIATIVITY, NSETS and CACHE_SIZE defined in section
; B2.3.3 of the ARMv5 ARM.
EvaluateCache
        AND     a3, a1, #CT_assoc_mask+CT_M
        TEQ     a3, #(CT_assoc_0:SHL:CT_assoc_pos)+CT_M
        BEQ     %FT80
        MOV     ip, #1
        ASSERT  CT_len_pos = 0
        AND     a4, a1, #CT_len_mask
        ADD     a4, a4, #3
        MOV     a4, ip, LSL a4                  ; LineLen = 1 << (len+3)
        STRB    a4, [a2, #ICache_LineLen-ICache_Info]
        MOV     a3, #2
        TST     a1, #CT_M
        ADDNE   a3, a3, #1                      ; Multiplier = 2 + M
        AND     a4, a1, #CT_assoc_mask
        RSB     a4, ip, a4, LSR #CT_assoc_pos
        MOV     a4, a3, LSL a4                  ; Associativity = Multiplier << (assoc-1)
        STRB    a4, [a2, #ICache_Associativity-ICache_Info]
        AND     a4, a1, #CT_size_mask
        MOV     a4, a4, LSR #CT_size_pos
        MOV     a3, a3, LSL a4
        MOV     a3, a3, LSL #8                  ; Size = Multiplier << (size+8)
        STR     a3, [a2, #ICache_Size-ICache_Info]
        ADD     a4, a4, #6
        AND     a3, a1, #CT_assoc_mask
        SUB     a4, a4, a3, LSR #CT_assoc_pos
        AND     a3, a1, #CT_len_mask
        ASSERT  CT_len_pos = 0
        SUB     a4, a4, a3
        MOV     a4, ip, LSL a4                  ; NSets = 1 << (size + 6 - assoc - len)
        STR     a4, [a2, #ICache_NSets-ICache_Info]
        MOV     pc, lr


80      MOV     a1, #0
        STR     a1, [a2, #ICache_NSets-ICache_Info]
        STR     a1, [a2, #ICache_Size-ICache_Info]
        STRB    a1, [a2, #ICache_LineLen-ICache_Info]
        STRB    a1, [a2, #ICache_Associativity-ICache_Info]
        MOV     pc, lr


; Create a list of CPUs, 16 bytes per entry:
;    ID bits (1 word)
;    Test mask for ID (1 word)
;    Cache type register value (1 word)
;    Processor type (1 byte)
;    Architecture type (1 byte)
;    Reserved (2 bytes)
        GBLA    tempcpu

        MACRO
        CPUDesc $proc, $id, $mask, $arch, $type, $s, $dsz, $das, $dln, $isz, $ias, $iln
        LCLA    type
type    SETA    (CT_ctype_$type:SHL:CT_ctype_pos)+($s:SHL:CT_S_pos)
tempcpu CSzDesc $dsz, $das, $dln
type    SETA    type+(tempcpu:SHL:CT_Dsize_pos)
        [ :LNOT:($s=0 :LAND: "$isz"="")
tempcpu CSzDesc $isz, $ias, $iln
        ]
type    SETA    type+(tempcpu:SHL:CT_Isize_pos)
        ASSERT  ($id :AND: :NOT: $mask) = 0
        DCD     $id, $mask, type
        DCB     $proc, $arch, 0, 0
        MEND

        MACRO
$var    CSzDesc $sz, $as, $ln
$var    SETA    (CT_size_$sz:SHL:CT_size_pos)+(CT_assoc_$as:SHL:CT_assoc_pos)+(CT_len_$ln:SHL:CT_len_pos)
$var    SETA    $var+(CT_M_$sz:SHL:CT_M_pos)
        MEND


; CPUDesc table for ARMv3-ARMv6
KnownCPUTable
;                                                        /------Cache Type register fields-----\�
;                              ID reg   Mask     Arch    Type         S  Dsz Das Dln Isz Ias Iln
        CPUDesc ARM600,        &000600, &00FFF0, ARMv3,   WT,         0,  4K, 64, 4
        CPUDesc ARM610,        &000610, &00FFF0, ARMv3,   WT,         0,  4K, 64, 4
        CPUDesc ARMunk,        &000000, &00F000, ARMv3,   WT,         0,  4K, 64, 4
        CPUDesc ARM700,        &007000, &FFFFF0, ARMv3,   WT,         0,  8K,  4, 8
        CPUDesc ARM710,        &007100, &FFFFF0, ARMv3,   WT,         0,  8K,  4, 8
        CPUDesc ARM710a,       &047100, &FDFFF0, ARMv3,   WT,         0,  8K,  4, 4
        CPUDesc ARM7500,       &067100, &FFFFF0, ARMv3,   WT,         0,  4K,  4, 4
        CPUDesc ARM7500FE,     &077100, &FFFFF0, ARMv3,   WT,         0,  4K,  4, 4
        CPUDesc ARMunk,        &007000, &80F000, ARMv3,   WT,         0,  8K,  4, 4
        CPUDesc ARM720T,       &807200, &FFFFF0, ARMv4T,  WT,         0,  8K,  4, 4
        CPUDesc ARMunk,        &807000, &80F000, ARMv4T,  WT,         0,  8K,  4, 4
        CPUDesc SA110_preRevT, &01A100, &0FFFFC, ARMv4,   WB_Crd,     1, 16K, 32, 8, 16K, 32, 8
        CPUDesc SA110,         &01A100, &0FFFF0, ARMv4,   WB_Crd,     1, 16K, 32, 8, 16K, 32, 8
        CPUDesc SA1100,        &01A110, &0FFFF0, ARMv4,   WB_Crd,     1,  8K, 32, 8, 16K, 32, 8
        CPUDesc SA1110,        &01B110, &0FFFF0, ARMv4,   WB_Crd,     1,  8K, 32, 8, 16K, 32, 8
        CPUDesc ARM920T,       &029200, &0FFFF0, ARMv4T,  WB_CR7_LDa, 1, 16K, 64, 8, 16K, 64, 8
        CPUDesc ARM922T,       &029220, &0FFFF0, ARMv4T,  WB_CR7_LDa, 1,  8K, 64, 8,  8K, 64, 8
        CPUDesc X80200,        &052000, &0FFFF0, ARMv5TE, WB_Cal_LD,  1, 32K, 32, 8, 32K, 32, 8
        CPUDesc X80321,    &69052400, &FFFFF700, ARMv5TE, WB_Cal_LD,  1, 32K, 32, 8, 32K, 32, 8
        DCD     -1

; Simplified CPUDesc table for ARMvF
; The cache size data is ignored for ARMv7.
KnownCPUTable_Fancy
        CPUDesc Cortex_A8,     &00C080, &00FFF0, ARMvF, WB_CR7_Lx, 1, 16K, 32, 16, 16K, 32, 16
        CPUDesc ARM1176JZF_S,  &00B760, &00FFF0, ARMv6, WB_CR7_LDa, 1, 16K, 32, 16,16K, 32, 16
        DCD     -1

; Peculiar characteristics of individual ARMs not deducable otherwise. First field is
; flags to set, second flags to clear.
KnownCPUFlags
        DCD     0,                            0    ; ARM 600
        DCD     0,                            0    ; ARM 610
        DCD     0,                            0    ; ARM 700
        DCD     0,                            0    ; ARM 710
        DCD     0,                            0    ; ARM 710a
        DCD     CPUFlag_AbortRestartBroken+CPUFlag_InterruptDelay,   0    ; SA 110 pre revT
        DCD     CPUFlag_InterruptDelay,       0    ; SA 110 revT or later
        DCD     0,                            0    ; ARM 7500
        DCD     0,                            0    ; ARM 7500FE
        DCD     CPUFlag_InterruptDelay,       0    ; SA 1100
        DCD     CPUFlag_InterruptDelay,       0    ; SA 1110
        DCD     CPUFlag_NoWBDrain,            0    ; ARM 720T
        DCD     0,                            0    ; ARM 920T
        DCD     0,                            0    ; ARM 922T
        DCD     CPUFlag_ExtendedPages+CPUFlag_XScale,  0    ; X80200
        DCD     CPUFlag_XScale,               0    ; X80321
        DCD     0,                            0    ; Cortex_A8
        DCD     0,                            0    ; ARM1176JZF_S

 [ MEMM_Type = "VMSAv6"
; --------------------------------------------------------------------------
; ----- ARM_Analyse_Fancy --------------------------------------------------
; --------------------------------------------------------------------------
;
; For ARMv7 ARMs (arch=&F), we can detect everything via the feature registers
; TODO - There's some stuff in here that can be tidied up/removed

; Things we need to set up:
; ProcessorType     (as listed in hdr.ARMops)
; Cache_Type        (CT_ctype_* from hdr:MEMM.ARM600)
; ProcessorArch     (as reported by Init_ARMarch)
; ProcessorFlags    (CPUFlag_* from hdr.ARMops)
; Proc_*            (Cache/TLB/IMB/MMU function pointers)
; MMU_PCBTrans      (Points to lookup table for translating page table cache options)
; ICache_*, DCache_* (ICache, DCache properties - optional, since not used externally?)

ARM_Analyse_Fancy
        Push    "v1,v2,v5,v6,v7,lr"
        ARM_read_ID v1
        LDR     v6, =ZeroPage
        ADRL    v7, KnownCPUTable_Fancy
10
        LDMIA   v7!, {a1, a2}
        CMP     a1, #-1
        BEQ     %FT20
        AND     a2, v1, a2
        TEQ     a1, a2
        ADDNE   v7, v7, #8
        BNE     %BT10
20
        LDR     v2, [v7]
        CMP     a1, #-1
        LDRNEB  a2, [v7, #4]
        MOVEQ   a2, #ARMunk
        STRB    a2, [v6, #ProcessorType]

        AND     a1, v2, #CT_ctype_mask
        MOV     a1, a1, LSR #CT_ctype_pos
        STRB    a1, [v6, #Cache_Type]

        MOV     v5, #CPUFlag_32bitOS+CPUFlag_No26bitMode ; 26bit has been obsolete for a long time
        [ HiProcVecs
        ORR     v5, v5, #CPUFlag_HiProcVecs
        ]

        ; Work out whether the cache info is in ARMv6 or ARMv7 style
        MRC     p15, 0, a1, c0, c0, 1
        TST     a1, #&80000000
        BNE     %FT25

        ; ARMv6 format cache type register.
        ; TODO - Use the cache type register to deduce the cache info.
        ; For now, just fall back on the values in the CPU table.
        ASSERT  CT_Isize_pos = 0
        MOV     a1, v2
        ADD     a2, v6, #ICache_Info
        BL      EvaluateCache
        MOV     a1, v2, LSR #CT_Dsize_pos
        ADD     a2, v6, #DCache_Info
        BL      EvaluateCache
        B       %FT27

25
        ; ARMv7 format cache type register. This should(!) mean that we have the cache level ID register, and all the other ARMv7 cache registers.

        ; Do we have a split cache?
        MRC     p15, 1, a1, c0, c0, 1
        AND     a2, a1, #7
        TEQ     a2, #3
        ORREQ   v5, v5, #CPUFlag_SynchroniseCodeAreas+CPUFlag_SplitCache

27
        [ CacheOff
        ORR     v5, v5, #CPUFlag_SynchroniseCodeAreas
        |
        ARM_read_control a1                     ; if Z bit set then we have branch prediction,
        TST     a1, #MMUC_Z                     ; so we need OS_SynchroniseCodeAreas even if not
        ORRNE   v5, v5, #CPUFlag_SynchroniseCodeAreas   ; split caches
        ]

        ; Test abort timing (base restored or base updated)
        MOV     a1, #&8000
        LDR     a2, [a1], #4                    ; Will abort - DAb handler will continue execution
        TEQ     a1, #&8000
        ORREQ   v5, v5, #CPUFlag_BaseRestored

        ; Check store of PC
30      STR     pc, [sp, #-4]!
        ADR     a2, %BT30 + 8
        LDR     a1, [sp], #4
        TEQ     a1, a2
        ORREQ   v5, v5, #CPUFlag_StorePCplus8

        BL      Init_ARMarch
        STRB    a1, [v6, #ProcessorArch]

        MRC     p15, 0, a1, c0, c2, 2
        TST     a1, #&F000
        ORRNE   v5, v5, #CPUFlag_LongMul

        MRC     p15, 0, a1, c0, c1, 0
        TST     a1, #&F000
        ORRNE   v5, v5, #CPUFlag_Thumb

        MSR     CPSR_f, #Q32_bit
        MRS     lr, CPSR
        TST     lr, #Q32_bit
        ORRNE   v5, v5, #CPUFlag_DSP ; Should we check instruction set attr register 3 for this?

        ; Other flags not checked for above:
        ; CPUFlag_InterruptDelay
        ; CPUFlag_VectorReadException
        ; CPUFlag_ExtendedPages
        ; CPUFlag_NoWBDrain
        ; CPUFlag_AbortRestartBroken
        ; CPUFlag_XScale
        ; CPUFlag_XScaleJTAGconnected

        LDRB    v4, [v6, #ProcessorType]

        TEQ     v4, #ARMunk                     ; Modify deduced flags
        ADRNEL  lr, KnownCPUFlags
        ADDNE   lr, lr, v4, LSL #3
        LDMNEIA lr, {a2, a3}
        ORRNE   v5, v5, a2
        BICNE   v5, v5, a3

        STR     v5, [v6, #ProcessorFlags]

        ; Cache analysis

        LDRB    a2, [v6, #Cache_Type]
        TEQ     a2, #CT_ctype_WT
        TSTEQ   v5, #CPUFlag_SplitCache
        BEQ     Analyse_WriteThroughUnified     ; eg. ARM7TDMI derivative

        TEQ     a2, #CT_ctype_WB_CR7_LDa
        BEQ     Analyse_WB_CR7_LDa              ; eg. ARM9

        TEQ     a2, #CT_ctype_WB_Crd
        BEQ     Analyse_WB_Crd                  ; eg. StrongARM

        TEQ     a2, #CT_ctype_WB_Cal_LD
        BEQ     Analyse_WB_Cal_LD               ; assume XScale

        TEQ     a2, #CT_ctype_WB_CR7_Lx
        BEQ     Analyse_WB_CR7_Lx

        ; others ...

        B       WeirdARMPanic                   ; stiff :)
 ] ; MEMM_Type = "VMSAv6"

; --------------------------------------------------------------------------
; ----- ARMops -------------------------------------------------------------
; --------------------------------------------------------------------------
;
; ARMops are the routines required by the kernel for cache/MMU control
; the kernel vectors to the appropriate ops for the given ARM at boot
;
; The Rules:
;   - These routines may corrupt a1 and lr only
;   - (lr can of course only be corrupted whilst still returning to correct
;     link address)
;   - stack is available, at least 16 words can be stacked
;   - a NULL op would be a simple MOV pc, lr
;

; --------------------------------------------------------------------------
; ----- ARMops for ARMv3 ---------------------------------------------------
; --------------------------------------------------------------------------
;
; ARMv3 ARMs include ARM710, ARM610, ARM7500
;

Cache_Invalidate_ARMv3
        MCR     p15, 0, a1, c7, c0
NullOp  MOV     pc, lr

WriteBuffer_Drain_ARMv3
        ;swap always forces unbuffered write, stalling till WB empty
        SUB     sp, sp, #4
        SWP     a1, a1, [sp]
        ADD     sp, sp, #4
        MOV     pc, lr

TLB_Invalidate_ARMv3
        MCR     p15, 0, a1, c5, c0
        MOV     pc, lr

; a1 = page entry to invalidate (page aligned address)
;
TLB_InvalidateEntry_ARMv3
        MCR     p15, 0, a1, c6, c0
        MOV     pc, lr

MMU_Changing_ARMv3
        MCR     p15, 0, a1, c7, c0      ; invalidate cache
        MCR     p15, 0, a1, c5, c0      ; invalidate TLB
        MOV     pc, lr

MMU_ChangingUncached_ARMv3
        MCR     p15, 0, a1, c5, c0      ; invalidate TLB
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_ARMv3
        MCR     p15, 0, a1, c7, c0      ; invalidate cache
        MCR     p15, 0, a1, c6, c0      ; invalidate TLB entry
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_ARMv3 ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_Changing_ARMv3
        Push    "a2"
        MCR     p15, 0, a1, c7, c0      ; invalidate cache
10
        MCR     p15, 0, a1, c6, c0      ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingUncachedEntry_ARMv3
        MCR     p15, 0, a1, c6, c0      ; invalidate TLB entry
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries_ARMv3 ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_ChangingUncached_ARMv3
        Push    "a2"
10
        MCR     p15, 0, a1, c6, c0      ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr

Cache_RangeThreshold_ARMv3
        ! 0, "arbitrary Cache_RangeThreshold_ARMv3"
        MOV     a1, #16*PageSize
        MOV     pc, lr

        LTORG

; --------------------------------------------------------------------------
; ----- generic ARMops for simple ARMs, ARMv4 onwards ----------------------
; --------------------------------------------------------------------------
;
; eg. ARM7TDMI based ARMs, unified, writethrough cache
;

Cache_InvalidateUnified
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7
        MOV     pc, lr

WriteBuffer_Drain_OffOn
        ; used if ARM has no drain WBuffer MCR op
        Push    "a2"
        ARM_read_control a1
        BIC     a2, a1, #MMUC_W
        ARM_write_control a2
        ARM_write_control a1
        Pull    "a2"
        MOV     pc, lr

WriteBuffer_Drain
        ; used if ARM has proper drain WBuffer MCR op
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4
        MOV     pc, lr

TLB_Invalidate_Unified
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7
        MOV     pc, lr

; a1 = page entry to invalidate (page aligned address)
;
TLB_InvalidateEntry_Unified
        MCR     p15, 0, a1, c8, c7, 1
        MOV     pc, lr

MMU_Changing_Writethrough
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7      ; invalidate cache
        MCR     p15, 0, a1, c8, c7      ; invalidate TLB
        MOV     pc, lr

MMU_ChangingUncached
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7      ; invalidate TLB
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_Writethrough
        Push    "a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7      ; invalidate cache
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        Pull    "a4"
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_Writethrough  ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_Changing_Writethrough
        Push    "a2,a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7      ; invalidate cache
10
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2,a4"
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingUncachedEntry
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_ChangingUncached
        Push    "a2"
10
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr

Cache_RangeThreshold_Writethrough
        ! 0, "arbitrary Cache_RangeThreshold_Writethrough"
        MOV     a1, #16*PageSize
        MOV     pc, lr

; --------------------------------------------------------------------------
; ----- ARMops for ARM9 and the like ---------------------------------------
; --------------------------------------------------------------------------

; WB_CR7_LDa refers to ARMs with writeback data cache, cleaned with
; register 7, lockdown available (format A)
;
; Note that ARM920 etc have writeback/writethrough data cache selectable
; by MMU regions. For simpliciity, we assume cacheable pages are mostly
; writeback. Any writethrough pages will have redundant clean operations
; applied when moved, for example, but this is a small overhead (cleaning
; a clean line is very quick on ARM 9).

Cache_CleanAll_WB_CR7_LDa ROUT
;
; only guarantees to clean lines not involved in interrupts (so we can
; clean without disabling interrupts)
;
; Clean cache by traversing all segment and index values
; As a concrete example, for ARM 920 (16k+16k caches) we would have:
;
;    DCache_LineLen       = 32         (32 byte cache line, segment field starts at bit 5)
;    DCache_IndexBit      = &04000000  (index field starts at bit 26)
;    DCache_IndexSegStart = &000000E0  (start at index=0, segment = 7)
;
        Push    "a2, ip"
        LDR     ip, =ZeroPage
        LDRB    a1, [ip, #DCache_LineLen]        ; segment field starts at this bit
        LDR     a2, [ip, #DCache_IndexBit]       ; index field starts at this bit
        LDR     ip, [ip, #DCache_IndexSegStart]  ; starting value, with index at min, seg at max
10
        MCR     p15, 0, ip, c7, c10, 2           ; clean DCache entry by segment/index
        ADDS    ip, ip, a2                       ; next index, counting up, CS if wrapped back to 0
        BCC     %BT10
        SUBS    ip, ip, a1                       ; next segment, counting down, CC if wrapped back to max
        BCS     %BT10                            ; if segment wrapped, then we've finished
        MOV     ip, #0
        MCR     p15, 0, ip, c7, c10, 4           ; drain WBuffer
        Pull    "a2, ip"
        MOV     pc, lr


Cache_CleanInvalidateAll_WB_CR7_LDa ROUT
;
; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache
;
        Push    "a2, ip"
        LDR     ip, =ZeroPage
        LDRB    a1, [ip, #DCache_LineLen]        ; segment field starts at this bit
        LDR     a2, [ip, #DCache_IndexBit]       ; index field starts at this bit
        LDR     ip, [ip, #DCache_IndexSegStart]  ; starting value, with index at min, seg at max
10
        MCR     p15, 0, ip, c7, c14, 2           ; clean&invalidate DCache entry by segment/index
        ADDS    ip, ip, a2                       ; next index, counting up, CS if wrapped back to 0
        BCC     %BT10
        SUBS    ip, ip, a1                       ; next segment, counting down, CC if wrapped back to max
        BCS     %BT10                            ; if segment wrapped, then we've finished
        MOV     ip, #0
        MCR     p15, 0, ip, c7, c10, 4           ; drain WBuffer
        MCR     p15, 0, ip, c7, c5, 0            ; invalidate ICache
        Pull    "a2, ip"
        MOV     pc, lr


Cache_InvalidateAll_WB_CR7_LDa ROUT
;
; no clean, assume caller knows what's happening
;
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7, 0           ; invalidate ICache and DCache
        MOV     pc, lr


Cache_RangeThreshold_WB_CR7_LDa ROUT
        LDR     a1, =ZeroPage
        LDR     a1, [a1, #DCache_RangeThreshold]
        MOV     pc, lr


TLB_InvalidateAll_WB_CR7_LDa ROUT
MMU_ChangingUncached_WB_CR7_LDa
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        MOV     pc, lr


; a1 = page affected (page aligned address)
;
TLB_InvalidateEntry_WB_CR7_LDa ROUT
MMU_ChangingUncachedEntry_WB_CR7_LDa
        MCR     p15, 0, a1, c8, c5, 1           ; invalidate ITLB entry
        MCR     p15, 0, a1, c8, c6, 1           ; invalidate DTLB entry
        MOV     pc, lr


WriteBuffer_Drain_WB_CR7_LDa ROUT
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4          ; drain WBuffer
        MOV     pc, lr


IMB_Full_WB_CR7_LDa ROUT
;
; do: clean DCache; drain WBuffer, invalidate ICache
;
        Push    "lr"
        BL      Cache_CleanAll_WB_CR7_LDa       ; also drains Wbuffer
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c5, 0           ; invalidate ICache
        Pull    "pc"

;  a1 = start address (inclusive, cache line aligned)
;  a2 = end address (exclusive, cache line aligned)
;
IMB_Range_WB_CR7_LDa ROUT
        SUB     a2, a2, a1
        CMP     a2, #32*1024                     ; arbitrary-ish range threshold
        ADD     a2, a2, a1
        BHS     IMB_Full_WB_CR7_LDa
        Push    "lr"
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c10, 1           ; clean DCache entry by VA
        MCR     p15, 0, a1, c7, c5, 1            ; invalidate ICache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4           ; drain WBuffer
        Pull    "pc"

MMU_Changing_WB_CR7_LDa ROUT
        Push    "lr"
        BL      Cache_CleanInvalidateAll_WB_CR7_LDa
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        Pull    "pc"

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_WB_CR7_LDa ROUT
        Push    "a2, lr"
        ADD     a2, a1, #PageSize
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry
        MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MOV     lr, #0
        MCR     p15, 0, lr, c7, c10, 4          ; drain WBuffer
        SUB     a1, a1, #PageSize
        MCR     p15, 0, a1, c8, c6, 1           ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1           ; invalidate ITLB entry
        Pull    "a2, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_WB_CR7_LDa ROUT
        Push    "a2, a3, lr"
        MOV     a2, a2, LSL #Log2PageSize
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold]   ;check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                         ;clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen]
        MOV     lr, a1
10
        MCR     p15, 0, a1, c7, c14, 1             ; clean&invalidate DCache entry
        MCR     p15, 0, a1, c7, c5, 1              ; invalidate ICache entry
        ADD     a1, a1, a3
        CMP     a1, a2
        BLO     %BT10
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4             ; drain WBuffer
        MOV     a1, lr                             ; restore start address
20
        MCR     p15, 0, a1, c8, c6, 1              ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1              ; invalidate ITLB entry
        ADD     a1, a1, #PageSize
        CMP     a1, a2
        BLO     %BT20
        Pull    "a2, a3, pc"
;
30
        BL      Cache_CleanInvalidateAll_WB_CR7_LDa
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        Pull    "a2, a3, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries_WB_CR7_LDa ROUT
        CMP     a2, #32                            ; arbitrary-ish threshold
        BHS     %FT20
        Push    "a2"
10
        MCR     p15, 0, a1, c8, c6, 1              ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1              ; invalidate ITLB entry
        ADD     a1, a1, #PageSize
        SUBS    a2, a2, #1
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr
;
20
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        MOV     pc, lr


; --------------------------------------------------------------------------
; ----- ARMops for StrongARM and the like ----------------------------------
; --------------------------------------------------------------------------

; WB_Crd is Writeback data cache, clean by reading data from cleaner area

; Currently no support for mini data cache on some StrongARM variants. Mini
; cache is always writeback and must have cleaning support, so is very
; awkward to use for cacheable screen, say.

; Global cache cleaning requires address space for private cleaner areas (not accessed
; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency
; hit), which means that the cleaner data is not invalidated afterwards. This is fine for
; RISC OS - where the private area is not used for anything else, and any re-use of the
; cache under interrupts is safe (eg. a page being moved is *never* involved in any
; active interrupts).

; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum
; cleaning cost while guaranteeing proper clean even if previous clean data is present. If
; the clean routine is re-entered, an independent, double sized clean is initiated. This
; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating
; the routine at all. The overhead is small, since by far the most common cleaning will be
; non-re-entered. The upshot is that the cleaner address space available must be at least 4
; times the cache size:
;   1 : used alternately, on 1st, 3rd, ... non-re-entered cleans
;   2 : used alternately, on 2nd, 4th, ... non-re-entered cleans
;   3 : used only for first half of a re-entered clean
;   4 : used only for second half of a re-entered clean
;
;   DCache_CleanBaseAddress   : start address of total cleaner space
;   DCache_CleanNextAddress   : start address for next non-re-entered clean, or 0 if re-entered


Cache_CleanAll_WB_Crd ROUT
;
; - cleans data cache (and invalidates it as a side effect)
; - can be used with interrupts enabled (to avoid latency over time of clean)
; - can be re-entered
; - see remarks at top of StrongARM ops for discussion of strategy
;

        Push    "a2-a4, v1, v2, lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #DCache_CleanBaseAddress]
        LDR     a2, =DCache_CleanNextAddress
        LDR     a3, [lr, #DCache_Size]
        LDRB    a4, [lr, #DCache_LineLen]
        MOV     v2, #0
        SWP     v1, v2, [a2]                        ; read current CleanNextAddr, zero it (semaphore)
        TEQ     v1, #0                              ; but if it is already zero, we have re-entered
        ADDEQ   v1, a1, a3, LSL #1                  ; if re-entered, start clean at Base+2*Cache_Size
        ADDEQ   v2, v1, a3, LSL #1                  ; if re-entered, do a clean of 2*Cache_Size
        ADDNE   v2, v1, a3                          ; if not re-entered, do a clean of Cache_Size
10
        LDR     lr, [v1], a4
        TEQ     v1, v2
        BNE     %BT10
        ADD     v2, a1, a3, LSL #1                  ; compare end address with Base+2*Cache_Size
        CMP     v1, v2
        MOVEQ   v1, a1                              ; if equal, not re-entered and Next wraps back
        STRLS   v1, [a2]                            ; if lower or same, not re-entered, so update Next
        MCR     p15, 0, a1, c7, c10, 4              ; drain WBuffer
        Pull    "a2-a4, v1, v2, pc"


Cache_CleanInvalidateAll_WB_Crd ROUT
IMB_Full_WB_Crd
;
;does not truly invalidate DCache, but effectively invalidates (flushes) all lines not
;involved in interrupts - this is sufficient for OS requirements, and means we don't
;have to disable interrupts for possibly slow clean
;
        Push    "lr"
        BL      Cache_CleanAll_WB_Crd               ;clean DCache (wrt to non-interrupt stuff)
        MCR     p15, 0, a1, c7, c5, 0               ;flush ICache
        Pull    "pc"

Cache_InvalidateAll_WB_Crd
;
; no clean, assume caller knows what is happening
;
        MCR     p15, 0, a1, c7, c7, 0               ;flush ICache and DCache
        MCR     p15, 0, a1, c7, c10, 4              ;drain WBuffer
        MOV     pc, lr

Cache_RangeThreshold_WB_Crd
        LDR     a1, =ZeroPage
        LDR     a1, [a1, #DCache_RangeThreshold]
        MOV     pc, lr

TLB_InvalidateAll_WB_Crd
MMU_ChangingUncached_WB_Crd
        MCR     p15, 0, a1, c8, c7, 0              ;flush ITLB and DTLB
        MOV     pc, lr

TLB_InvalidateEntry_WB_Crd
MMU_ChangingUncachedEntry_WB_Crd
        MCR     p15, 0, a1, c8, c6, 1              ;flush DTLB entry
        MCR     p15, 0, a1, c8, c5, 0              ;flush ITLB
        MOV     pc, lr

WriteBuffer_Drain_WB_Crd
        MCR     p15, 0, a1, c7, c10, 4             ;drain WBuffer
        MOV     pc, lr


IMB_Range_WB_Crd ROUT
        SUB     a2, a2, a1
        CMP     a2, #64*1024                       ;arbitrary-ish range threshold
        ADD     a2, a2, a1
        BHS     IMB_Full_WB_Crd
        Push    "lr"
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c10, 1             ;clean DCache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c10, 4             ;drain WBuffer
        MCR     p15, 0, a1, c7, c5, 0              ;flush ICache
        Pull    "pc"

MMU_Changing_WB_Crd
        Push    "lr"
        BL      Cache_CleanAll_WB_Crd               ;clean DCache (wrt to non-interrupt stuff)
        MCR     p15, 0, a1, c7, c5, 0               ;flush ICache
        MCR     p15, 0, a1, c8, c7, 0               ;flush ITLB and DTLB
        Pull    "pc"

MMU_ChangingEntry_WB_Crd ROUT
;
;there is no clean&invalidate DCache instruction, however we can do clean
;entry followed by invalidate entry without an interrupt hole, because they
;are for the same virtual address (and that virtual address will not be
;involved in interrupts, since it is involved in remapping)
;
        Push    "a2, lr"
        ADD     a2, a1, #PageSize
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c10, 1             ;clean DCache entry
        MCR     p15, 0, a1, c7, c6, 1              ;flush DCache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        SUB     a1, a1, #PageSize
        MCR     p15, 0, a1, c7, c10, 4             ;drain WBuffer
        MCR     p15, 0, a1, c7, c5, 0              ;flush ICache
        MCR     p15, 0, a1, c8, c6, 1              ;flush DTLB entry
        MCR     p15, 0, a1, c8, c5, 0              ;flush ITLB
        Pull    "a2, pc"

MMU_ChangingEntries_WB_Crd ROUT
;
;same comments as MMU_ChangingEntry_WB_Crd
;
        Push    "a2, a3, lr"
        MOV     a2, a2, LSL #Log2PageSize
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold]   ;check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                         ;clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen]
        MOV     lr, a1
10
        MCR     p15, 0, a1, c7, c10, 1             ;clean DCache entry
        MCR     p15, 0, a1, c7, c6, 1              ;flush DCache entry
        ADD     a1, a1, a3
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c10, 4             ;drain WBuffer
        MCR     p15, 0, a1, c7, c5, 0              ;flush ICache
        MOV     a1, lr                             ;restore start address
20
        MCR     p15, 0, a1, c8, c6, 1              ;flush DTLB entry
        ADD     a1, a1, #PageSize
        CMP     a1, a2
        BLO     %BT20
        MCR     p15, 0, a1, c8, c5, 0              ;flush ITLB
        Pull    "a2, a3, pc"
;
30
        BL      Cache_CleanAll_WB_Crd              ;clean DCache (wrt to non-interrupt stuff)
        MCR     p15, 0, a1, c7, c5, 0              ;flush ICache
        MCR     p15, 0, a1, c8, c7, 0              ;flush ITLB and DTLB
        Pull    "a2, a3, pc"

MMU_ChangingUncachedEntries_WB_Crd ROUT
        CMP     a2, #32                            ;arbitrary-ish threshold
        BHS     %FT20
        Push    "lr"
        MOV     lr, a2
10
        MCR     p15, 0, a1, c8, c6, 1              ;flush DTLB entry
        ADD     a1, a1, #PageSize
        SUBS    lr, lr, #1
        BNE     %BT10
        MCR     p15, 0, a1, c8, c5, 0              ;flush ITLB
        Pull    "pc"
;
20
        MCR     p15, 0, a1, c8, c7, 0              ;flush ITLB and DTLB
        MOV     pc, lr


; ARMops for XScale, mjs Feb 2001
;
; WB_Cal_LD is writeback, clean with allocate, lockdown
;
; If the mini data cache is used (XScaleMiniCache true), it is assumed to be
; configured writethrough (eg. used for RISC OS screen memory). This saves an ugly/slow
; mini cache clean for things like IMB_Full.
;
; Sadly, for global cache invalidate with mini cache, things are awkward. We can't clean the
; main cache then do the global invalidate MCR, unless we tolerate having _all_ interrupts
; off (else the main cache may be slightly dirty from interrupts, and the invalidate
; will lose data). So we must reluctantly 'invalidate' the mini cache by the ugly/slow
; mechanism as if we were cleaning it :-( Intel should provide a separate global invalidate
; (and perhaps a line allocate) for the mini cache.
;
; We do not use lockdown.
;
; For simplicity, we assume cacheable pages are mostly writeback. Any writethrough
; pages will be invalidated as if they were writeback, but there is little overhead
; (cleaning a clean line or allocating a line from cleaner area are both fast).

; Global cache cleaning requires address space for private cleaner areas (not accessed
; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency
; hit), which means that the cleaner data is not invalidated afterwards. This is fine for
; RISC OS - where the private area is not used for anything else, and any re-use of the
; cache under interrupts is safe (eg. a page being moved is *never* involved in any
; active interrupts).

; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum
; cleaning cost while guaranteeing proper clean even if previous clean data is present. If
; the clean routine is re-entered, an independent, double sized clean is initiated. This
; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating
; the routine at all. The overhead is small, since by far the most common cleaning will be
; non-re-entered. The upshot is that the cleaner address space available must be at least 4
; times the cache size:
;   1 : used alternately, on 1st, 3rd, ... non-re-entered cleans
;   2 : used alternately, on 2nd, 4th, ... non-re-entered cleans
;   3 : used only for first half of a re-entered clean
;   4 : used only for second half of a re-entered clean
;
; If the mini cache is used, it has its own equivalent cleaner space and algorithm.
; Parameters for each cache are:
;
;    Cache_CleanBaseAddress   : start address of total cleaner space
;    Cache_CleanNextAddress   : start address for next non-re-entered clean, or 0 if re-entered


                 GBLL XScaleMiniCache  ; *must* be configured writethrough if used
XScaleMiniCache  SETL {FALSE}


; MACRO to do Intel approved CPWAIT, to guarantee any previous MCR's have taken effect
; corrupts a1
;
        MACRO
        CPWAIT
        MRC      p15, 0, a1, c2, c0, 0               ; arbitrary read of CP15
        MOV      a1, a1                              ; wait for it
        ; SUB pc, pc, #4 omitted, because all ops have a pc load to return to caller
        MEND


Cache_CleanAll_WB_Cal_LD ROUT
;
; - cleans main cache (and invalidates as a side effect)
; - if mini cache is in use, will be writethrough so no clean required
; - can be used with interrupts enabled (to avoid latency over time of clean)
; - can be re-entered
; - see remarks at top of XScale ops for discussion of strategy
;
        Push    "a2-a4, v1, v2, lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #DCache_CleanBaseAddress]
        LDR     a2, =ZeroPage+DCache_CleanNextAddress
        LDR     a3, [lr, #DCache_Size]
        LDRB    a4, [lr, #DCache_LineLen]
        MOV     v2, #0
        SWP     v1, v2, [a2]                        ; read current CleanNextAddr, zero it (semaphore)
        TEQ     v1, #0                              ; but if it is already zero, we have re-entered
        ADDEQ   v1, a1, a3, LSL #1                  ; if re-entered, start clean at Base+2*Cache_Size
        ADDEQ   v2, v1, a3, LSL #1                  ; if re-entered, do a clean of 2*Cache_Size
        ADDNE   v2, v1, a3                          ; if not re-entered, do a clean of Cache_Size
10
        MCR     p15, 0, v1, c7, c2, 5               ; allocate address from cleaner space
        ADD     v1, v1, a4
        TEQ     v1, v2
        BNE     %BT10
        ADD     v2, a1, a3, LSL #1                  ; compare end address with Base+2*Cache_Size
        CMP     v1, v2
        MOVEQ   v1, a1                              ; if equal, not re-entered and Next wraps back
        STRLS   v1, [a2]                            ; if lower or same, not re-entered, so update Next
        MCR     p15, 0, a1, c7, c10, 4              ; drain WBuffer (waits, so no need for CPWAIT)
        Pull    "a2-a4, v1, v2, pc"

  [ XScaleMiniCache

Cache_MiniInvalidateAll_WB_Cal_LD ROUT
;
; similar to Cache_CleanAll_WB_Cal_LD, but must do direct reads (cannot use allocate address MCR), and
; 'cleans' to achieve invalidate as side effect (mini cache will be configured writethrough)
;
        Push    "a2-a4, v1, v2, lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #MCache_CleanBaseAddress]
        LDR     a2, =ZeroPage+MCache_CleanNextAddr
        LDR     a3, [lr, #MCache_Size]
        LDRB    a4, [lr, #MCache_LineLen]
        MOV     v2, #0
        SWP     v1, v2, [a2]                        ; read current CleanNextAddr, zero it (semaphore)
        TEQ     v1, #0                              ; but if it is already zero, we have re-entered
        ADDEQ   v1, a1, a3, LSL #1                  ; if re-entered, start clean at Base+2*Cache_Size
        ADDEQ   v2, v1, a3, LSL #1                  ; if re-entered, do a clean of 2*Cache_Size
        ADDNE   v2, v1, a3                          ; if not re-entered, do a clean of Cache_Size
10
        LDR     lr, [v1], a4                        ; read a line of cleaner data
        TEQ     v1, v2
        BNE     %BT10
        ADD     v2, a1, a3, LSL #1                  ; compare end address with Base+2*Size
        CMP     v1, v2
        MOVEQ   v1, a1                              ; if equal, not re-entered and Next wraps back
        STRLS   v1, [a2]                            ; if lower or same, not re-entered, so update Next
        ; note, no drain WBuffer, since we are really only invalidating a writethrough cache
        Pull    "a2-a4, v1, v2, pc"

  ] ; XScaleMiniCache


Cache_CleanInvalidateAll_WB_Cal_LD ROUT
;
; - cleans main cache (and invalidates wrt OS stuff as a side effect)
; - if mini cache in use (will be writethrough), 'cleans' in order to invalidate as side effect
;
        Push    "lr"
        BL      Cache_CleanAll_WB_Cal_LD
  [ XScaleMiniCache
        BL      Cache_MiniInvalidateAll_WB_Cal_LD
  ]
        MCR     p15, 0, a1, c7, c5, 0                ; invalidate ICache and BTB
        CPWAIT
        Pull    "pc"


Cache_InvalidateAll_WB_Cal_LD ROUT
;
; no clean, assume caller knows what's happening
;
        MCR     p15, 0, a1, c7, c7, 0           ; invalidate DCache, (MiniCache), ICache and BTB
        CPWAIT
        MOV     pc, lr


Cache_RangeThreshold_WB_Cal_LD ROUT
        LDR     a1, =ZeroPage
        LDR     a1, [a1, #DCache_RangeThreshold]
        MOV     pc, lr


TLB_InvalidateAll_WB_Cal_LD ROUT
MMU_ChangingUncached_WB_Cal_LD
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        CPWAIT
        MOV     pc, lr


TLB_InvalidateEntry_WB_Cal_LD ROUT
MMU_ChangingUncachedEntry_WB_Cal_LD
        MCR     p15, 0, a1, c8, c5, 1           ; invalidate ITLB entry
        MCR     p15, 0, a1, c8, c6, 1           ; invalidate DTLB entry
        CPWAIT
        MOV     pc, lr


WriteBuffer_Drain_WB_Cal_LD ROUT
        MCR     p15, 0, a1, c7, c10, 4          ; drain WBuffer (waits, so no need for CPWAIT)
        MOV     pc, lr


IMB_Full_WB_Cal_LD
        Push    "lr"
        BL      Cache_CleanAll_WB_Cal_LD             ; clean DCache (wrt to non-interrupt stuff)
        MCR     p15, 0, a1, c7, c5, 0                ; invalidate ICache and BTB
        CPWAIT
        Pull    "pc"


IMB_Range_WB_Cal_LD ROUT
        SUB     a2, a2, a1
        CMP     a2, #32*1024                     ; arbitrary-ish range threshold
        ADD     a2, a2, a1
        BHS     IMB_Full_WB_Cal_LD
        Push    "lr"
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c10, 1           ; clean DCache entry
 [ :LNOT:XScaleJTAGDebug
        MCR     p15, 0, a1, c7, c5, 1            ; invalidate ICache entry
 ]
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
 [ XScaleJTAGDebug
        MCR     p15, 0, a1, c7, c5, 0            ; invalidate ICache and BTB
 |
        MCR     p15, 0, a1, c7, c5, 6            ; invalidate BTB
 ]
        MCR     p15, 0, a1, c7, c10, 4           ; drain WBuffer (waits, so no need for CPWAIT)
        Pull    "pc"


MMU_Changing_WB_Cal_LD ROUT
        Push    "lr"
        BL      Cache_CleanAll_WB_Cal_LD
        MCR     p15, 0, a1, c7, c5, 0           ; invalidate ICache and BTB
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        CPWAIT
        Pull    "pc"

MMU_ChangingEntry_WB_Cal_LD ROUT
;
;there is no clean&invalidate DCache instruction, however we can do clean
;entry followed by invalidate entry without an interrupt hole, because they
;are for the same virtual address (and that virtual address will not be
;involved in interrupts, since it is involved in remapping)
;
        Push    "a2, lr"
        ADD     a2, a1, #PageSize
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c10, 1          ; clean DCache entry
        MCR     p15, 0, a1, c7, c6, 1           ; invalidate DCache entry
 [ :LNOT:XScaleJTAGDebug
        MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry
 ]
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c10, 4          ; drain WBuffer
 [ XScaleJTAGDebug
        MCR     p15, 0, a1, c7, c5, 0           ; invalidate ICache and BTB
 |
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate BTB
 ]
        SUB     a1, a1, #PageSize
        MCR     p15, 0, a1, c8, c6, 1           ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1           ; invalidate ITLB entry
        CPWAIT
        Pull    "a2, pc"


MMU_ChangingEntries_WB_Cal_LD ROUT
;
;same comments as MMU_ChangingEntry_WB_Cal_LD
;
        Push    "a2, a3, lr"
        MOV     a2, a2, LSL #Log2PageSize
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold]   ;check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                         ;clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen]
        MOV     lr, a1
10
        MCR     p15, 0, a1, c7, c10, 1             ; clean DCache entry
        MCR     p15, 0, a1, c7, c6, 1              ; invalidate DCache entry
 [ :LNOT:XScaleJTAGDebug
        MCR     p15, 0, a1, c7, c5, 1              ; invalidate ICache entry
 ]
        ADD     a1, a1, a3
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c10, 4             ; drain WBuffer
 [ XScaleJTAGDebug
        MCR     p15, 0, a1, c7, c5, 0              ; invalidate ICache and BTB
 |
        MCR     p15, 0, a1, c7, c5, 6              ; invalidate BTB
 ]
        MOV     a1, lr                             ; restore start address
20
        MCR     p15, 0, a1, c8, c6, 1              ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1              ; invalidate ITLB entry
        ADD     a1, a1, #PageSize
        CMP     a1, a2
        BLO     %BT20
        CPWAIT
        Pull    "a2, a3, pc"
;
30
        BL      Cache_CleanInvalidateAll_WB_Cal_LD
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        CPWAIT
        Pull    "a2, a3, pc"

MMU_ChangingUncachedEntries_WB_Cal_LD ROUT
        CMP     a2, #32                            ; arbitrary-ish threshold
        BHS     %FT20
        Push    "lr"
        MOV     lr, a2
10
        MCR     p15, 0, a1, c8, c6, 1              ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1              ; invalidate ITLB entry
        SUBS    lr, lr, #1
        ADD     a1, a1, #PageSize
        BNE     %BT10
        CPWAIT
        Pull    "pc"
;
20
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        CPWAIT
        MOV     pc, lr

 [ MEMM_Type = "VMSAv6" ; Need appropriate myIMB, etc. implementations if this is to be removed

; --------------------------------------------------------------------------
; ----- ARMops for Cortex-A8 and the like ----------------------------------
; --------------------------------------------------------------------------

; WB_CR7_Lx refers to ARMs with writeback data cache, cleaned with
; register 7, and (potentially) multiple cache levels
;
; DCache_LineLen = log2(line len)-2 for smallest data/unified cache line length
; ICache_LineLen = log2(line len)-2 for smallest instruction cache line length
; DCache_RangeThreshold = clean threshold for data cache
; Cache_Lx_Info = Cache level ID register
; Cache_Lx_DTable = Cache size identification register for all 8 data/unified caches
; Cache_Lx_ITable = Cache size identification register for all 8 instruction caches

Cache_CleanAll_WB_CR7_Lx ROUT
; Clean cache by traversing all sets and ways for all data caches
        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #Cache_Lx_Info]!
        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
        BIC     a1, a1, #&FF000000 ; Discard unification/coherency bits
        MOV     a2, #0 ; Current cache level
20
        TST     a1, #7 ; Get flags
        BEQ     %FT10 ; Cache clean complete
        LDR     a3, [lr], #4 ; Get size info
        AND     v1, a3, #&7 ; log2(Line size)-2
        BIC     a3, a3, #&F0000007 ; Clear flags & line size
        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
        ; Way number needs to be packed right up at the high end of the data word; shift it up
        CLZ     a4, v2
        MOV     v2, v2, LSL a4
        ; Set number needs to start at log2(Line size)+2
        MOV     v3, v3, LSL #4 ; Start at bit 4
        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
        ; Now calculate the offset numbers we will use to increment sets & ways
        BIC     v4, v2, v2, LSL #1 ; Way increment
        BIC     v5, v3, v3, LSL #1 ; Set increment
        ; Now we can finally clean this cache!
        ORR     a3, a2, v3 ; Current way (0), set (max), and level
30
        MCR     p15, 0, a3, c7, c10, 2 ; Clean
        ADDS    a3, a3, v4 ; Increment way
        BCC     %BT30 ; Overflow will occur once ways are enumerated
        TST     a3, v3 ; Are set bits all zero?
        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
        BNE     %BT30
        ; This cache is now clean. Move on to the next level.
        ADD     a2, a2, #2
        MOVS    a1, a1, LSR #3
        BNE     %BT20
10
        myDSB   ,a1 ; Wait for cache cleaning to complete
        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"


Cache_CleanInvalidateAll_WB_CR7_Lx ROUT
;
; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache
;
        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #Cache_Lx_Info]!
        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
        BIC     a1, a1, #&FF000000 ; Discard unification/coherency bits
        MOV     a2, #0 ; Current cache level
20
        TST     a1, #7 ; Get flags
        BEQ     %FT10 ; Cache clean complete
        LDR     a3, [lr], #4 ; Get size info
        AND     v1, a3, #&7 ; log2(Line size)-2
        BIC     a3, a3, #&F0000007 ; Clear flags & line size
        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
        ; Way number needs to be packed right up at the high end of the data word; shift it up
        CLZ     a4, v2
        MOV     v2, v2, LSL a4
        ; Set number needs to start at log2(Line size)+2
        MOV     v3, v3, LSL #4 ; Start at bit 4
        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
        ; Now calculate the offset numbers we will use to increment sets & ways
        BIC     v4, v2, v2, LSL #1 ; Way increment
        BIC     v5, v3, v3, LSL #1 ; Set increment
        ; Now we can finally clean this cache!
        ORR     a3, a2, v3 ; Current way (0), set (max), and level
30
        MCR     p15, 0, a3, c7, c14, 2 ; Clean & invalidate
        ADDS    a3, a3, v4 ; Increment way
        BCC     %BT30 ; Overflow will occur once ways are enumerated
        TST     a3, v3 ; Are set bits all zero?
        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
        BNE     %BT30
        ; This cache is now clean. Move on to the next level.
        ADD     a2, a2, #2
        MOVS    a1, a1, LSR #3
        BNE     %BT20
10
        MOV     a1, #0
        myDSB   ,a1,,y                ; Wait for cache clean to complete
        MCR     p15, 0, a1, c7, c5, 0 ; invalidate ICache
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"


Cache_InvalidateAll_WB_CR7_Lx ROUT
;
; no clean, assume caller knows what's happening
;
        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #Cache_Lx_Info]!
        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
        BIC     a1, a1, #&FF000000 ; Discard unification/coherency bits
        MOV     a2, #0 ; Current cache level
20
        TST     a1, #7 ; Get flags
        BEQ     %FT10 ; Cache clean complete
        LDR     a3, [lr], #4 ; Get size info
        AND     v1, a3, #&7 ; log2(Line size)-2
        BIC     a3, a3, #&F0000007 ; Clear flags & line size
        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
        ; Way number needs to be packed right up at the high end of the data word; shift it up
        CLZ     a4, v2
        MOV     v2, v2, LSL a4
        ; Set number needs to start at log2(Line size)+2
        MOV     v3, v3, LSL #4 ; Start at bit 4
        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
        ; Now calculate the offset numbers we will use to increment sets & ways
        BIC     v4, v2, v2, LSL #1 ; Way increment
        BIC     v5, v3, v3, LSL #1 ; Set increment
        ; Now we can finally clean this cache!
        ORR     a3, a2, v3 ; Current way (0), set (max), and level
30
        MCR     p15, 0, a3, c7, c6, 2 ; Invalidate
        ADDS    a3, a3, v4 ; Increment way
        BCC     %BT30 ; Overflow will occur once ways are enumerated
        TST     a3, v3 ; Are set bits all zero?
        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
        BNE     %BT30
        ; This cache is now clean. Move on to the next level.
        ADD     a2, a2, #2
        MOVS    a1, a1, LSR #3
        BNE     %BT20
10
        MOV     a1, #0
        myDSB   ,a1,,y                ; Wait for invalidation to complete
        MCR     p15, 0, a1, c7, c5, 0 ; invalidate ICache
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"


Cache_RangeThreshold_WB_CR7_Lx ROUT
        LDR     a1, =ZeroPage
        LDR     a1, [a1, #DCache_RangeThreshold]
        MOV     pc, lr


MMU_ChangingUncached_WB_CR7_Lx
        myDSB   ,a1    ; Ensure the page table write has actually completed
        myISB   ,a1,,y ; Also required
TLB_InvalidateAll_WB_CR7_Lx ROUT
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        MOV     pc, lr


; a1 = page affected (page aligned address)
;
MMU_ChangingUncachedEntry_WB_CR7_Lx
      [ NoARMv7
        Push    "a2"
        myDSB   ,a2    ; Ensure the page table write has actually completed
        myISB   ,a2,,y ; Also required
        Pull    "a2"
      |
        myDSB
        myISB
      ]
TLB_InvalidateEntry_WB_CR7_Lx ROUT
        MCR     p15, 0, a1, c8, c7, 1 ; invalidate ITLB & DTLB entry
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1                   ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        MOV     pc, lr


WriteBuffer_Drain_WB_CR7_Lx ROUT
        myDSB   ,a1    ; DSB is the new name for write buffer draining
        myISB   ,a1,,y ; Also do ISB for extra paranoia
        MOV     pc, lr


IMB_Full_WB_CR7_Lx ROUT
;
; do: clean DCache; drain WBuffer, invalidate ICache/branch predictor
; Luckily, we only need to clean as far as the level of unification
;
        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #Cache_Lx_Info]!
        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
        MOV     a1, a1, LSR #27
        AND     a1, a1, #&7 ; Get level of unification
        MOV     a2, #0 ; Current cache level
        SUBS    a1, a1, #1
        BLT     %FT10 ; Cache clean complete
20
        LDR     a3, [lr], #4 ; Get size info
        AND     v1, a3, #&7 ; log2(Line size)-2
        BIC     a3, a3, #&F0000007 ; Clear flags & line size
        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
        ; Way number needs to be packed right up at the high end of the data word; shift it up
        CLZ     a4, v2
        MOV     v2, v2, LSL a4
        ; Set number needs to start at log2(Line size)+2
        MOV     v3, v3, LSL #4 ; Start at bit 4
        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
        ; Now calculate the offset numbers we will use to increment sets & ways
        BIC     v4, v2, v2, LSL #1 ; Way increment
        BIC     v5, v3, v3, LSL #1 ; Set increment
        ; Now we can finally clean this cache!
        ORR     a3, a2, v3 ; Current way (0), set (max), and level
30
        MCR     p15, 0, a3, c7, c10, 2 ; Clean
        ADDS    a3, a3, v4 ; Increment way
        BCC     %BT30 ; Overflow will occur once ways are enumerated
        TST     a3, v3 ; Are set bits all zero?
        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
        BNE     %BT30
        ; This cache is now clean. Move on to the next level.
        ADD     a2, a2, #2
        SUBS    a1, a1, #1
        BGE     %BT20
10
        MOV     a1, #0
        myDSB   ,a1,,y                ; Wait for clean to complete
        MCR     p15, 0, a1, c7, c5, 0 ; invalidate ICache
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"

;  a1 = start address (inclusive, cache line aligned)
;  a2 = end address (exclusive, cache line aligned)
;
IMB_Range_WB_CR7_Lx ROUT
        SUB     a2, a2, a1
        CMP     a2, #32*1024 ; Maximum L1 cache size on Cortex-A8 is 32K, use that to guess what approach to take
        ADD     a2, a2, a1
        CMPLO   a1, a2 ; The routine below will fail if the end address wraps around, so just IMB_Full instead
        BHS     IMB_Full_WB_CR7_Lx
        Push    "a1,a3,lr"
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen] ; log2(line len)-2
        MOV     a3, #4
        MOV     lr, a3, LSL lr
10
        MCR     p15, 0, a1, c7, c11, 1           ; clean DCache entry by VA to PoU
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        myDSB   ,a1  ; Wait for clean to complete
        Pull    "a1" ; Get start address back
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ
        MOV     lr, a3, LSL lr
10
        MCR     p15, 0, a1, c7, c5, 1            ; invalidate ICache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1                   ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a3,pc"

MMU_Changing_WB_CR7_Lx ROUT
        Push    "lr"
        myDSB   ,a1    ; Ensure the page table write has actually completed
        myISB   ,a1,,y ; Also required
        BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB
        myDSB   ,a1,,y                ; Wait TLB invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects are visible
        Pull    "pc"

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_WB_CR7_Lx ROUT
        Push    "a2, lr"
        myDSB   ,lr ; Ensure the page table write has actually completed
        myISB   ,lr,,y ; Also required
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen] ; log2(line len)-2
        MOV     a2, #4
        MOV     lr, a2, LSL lr
        ADD     a2, a1, #PageSize
10
        MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry to PoC
        ADD     a1, a1, lr
        CMP     a1, a2
        BNE     %BT10
        myDSB   ,lr ; Wait for clean to complete
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ
        MOV     a1, #4
        MOV     lr, a1, LSL lr
        SUB     a1, a2, #PageSize ; Get start address back
10
        MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry to PoC
        ADD     a1, a1, lr
        CMP     a1, a2
        BNE     %BT10
        SUB     a1, a1, #PageSize
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate DTLB and ITLB
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1
        myISB   ,a1,,y
        Pull    "a2, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_WB_CR7_Lx ROUT
        Push    "a2, a3, lr"
        myDSB   ,lr    ; Ensure the page table write has actually completed
        myISB   ,lr,,y ; Also required
        MOV     a2, a2, LSL #Log2PageSize
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold]   ;check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                         ;clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen] ; log2(line len)-2
        MOV     lr, #4
        MOV     a3, lr, LSL a3
        MOV     lr, a1
10
        MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry to PoC
        ADD     a1, a1, a3
        CMP     a1, a2
        BNE     %BT10
        myDSB   ,a3 ; Wait for clean to complete
        LDR     a3, =ZeroPage
        LDRB    a3, [a3, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ
        MOV     a1, #4
        MOV     a3, a1, LSL a3
        MOV     a1, lr ; Get start address back
10
        MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry to PoC
        ADD     a1, a1, a3
        CMP     a1, a2
        BNE     %BT10
20
        MCR     p15, 0, lr, c8, c7, 1              ; invalidate DTLB & ITLB entry
        ADD     lr, lr, #PageSize
        CMP     lr, a2
        BNE     %BT20
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1
        myISB   ,a1,,y
        Pull    "a2, a3, pc"
;
30
        BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        myDSB   ,a1,,y                ; Wait TLB invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects are visible
        Pull    "a2, a3, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries_WB_CR7_Lx ROUT
        Push    "a2,lr"
        myDSB   ,lr    ; Ensure the page table write has actually completed
        myISB   ,lr,,y ; Also required
        CMP     a2, #32                            ; arbitrary-ish threshold
        MCRHS   p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        BHS     %FT20
10
        MCR     p15, 0, a1, c8, c7, 1              ; invalidate DTLB & ITLB entry
        ADD     a1, a1, #PageSize
        SUBS    a2, a2, #1
        BNE     %BT10
20
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,lr,,y
        myISB   ,lr,,y
        Pull    "a2,pc"

 ] ; MEMM_Type = "VMSAv6"

; --------------------------------------------------------------------------


;        IMPORT  Write0_Translated

ARM_PrintProcessorType
        LDR     a1, =ZeroPage
        LDRB    a1, [a1, #ProcessorType]
        TEQ     a1, #ARMunk
        MOVEQ   pc, lr

        Push    "lr"
        ADR     a2, PNameTable
        LDHA    a1, a2, a1, a3
        ADD     a1, a2, a1
        BL      Write0_Translated
        SWI     XOS_NewLine
        SWI     XOS_NewLine
        Pull    "pc"

PNameTable
        DCW     PName_ARM600    - PNameTable
        DCW     PName_ARM610    - PNameTable
        DCW     PName_ARM700    - PNameTable
        DCW     PName_ARM710    - PNameTable
        DCW     PName_ARM710a   - PNameTable
        DCW     PName_SA110     - PNameTable      ; pre rev T
        DCW     PName_SA110     - PNameTable      ; rev T or later
        DCW     PName_ARM7500   - PNameTable
        DCW     PName_ARM7500FE - PNameTable
        DCW     PName_SA1100    - PNameTable
        DCW     PName_SA1110    - PNameTable
        DCW     PName_ARM720T   - PNameTable
        DCW     PName_ARM920T   - PNameTable
        DCW     PName_ARM922T   - PNameTable
        DCW     PName_X80200    - PNameTable
        DCW     PName_X80321    - PNameTable
        DCW     PName_Cortex_A8 - PNameTable
        DCW     PName_ARM1176JZF_S - PNameTable

PName_ARM600
        =       "600:ARM 600 Processor",0
PName_ARM610
        =       "610:ARM 610 Processor",0
PName_ARM700
        =       "700:ARM 700 Processor",0
PName_ARM710
        =       "710:ARM 710 Processor",0
PName_ARM710a
        =       "710a:ARM 710a Processor",0
PName_SA110
        =       "SA110:SA-110 Processor",0
PName_ARM7500
        =       "7500:ARM 7500 Processor",0
PName_ARM7500FE
        =       "7500FE:ARM 7500FE Processor",0
PName_SA1100
        =       "SA1100:SA-1100 Processor",0
PName_SA1110
        =       "SA1110:SA-1110 Processor",0
PName_ARM720T
        =       "720T:ARM 720T Processor",0
PName_ARM920T
        =       "920T:ARM 920T Processor",0
PName_ARM922T
        =       "922T:ARM 922T Processor",0
PName_X80200
        =       "X80200:80200 Processor",0
PName_X80321
        =       "X80321:80321 Processor",0
PName_Cortex_A8
        =       "CortexA8:Cortex-A8 Processor",0
PName_ARM1176JZF_S
        =       "ARM1176JZF_S:ARM1176JZF-S Processor",0
        ALIGN


; Lookup tables from DA flags PCB (bits 14:12,5,4, packed down to 4:2,1,0)
; to XCB bits in page table descriptors.

XCB_NB  *       1:SHL:0
XCB_NC  *       1:SHL:1
XCB_P   *       1:SHL:2

        ALIGN 32

; WT read-allocate cache (eg ARM720T)
XCBTableWT                                      ; C+B        CNB   NCB         NCNB
        = L2_C+L2_B, L2_C, L2_B, 0              ;        Default
        = L2_C+L2_B, L2_C, L2_B, 0              ; WT,         X,  Non-merging, X
        = L2_C+L2_B, L2_C, L2_B, 0              ; WB/RA,      X,  Merging,     X
        = L2_C+L2_B, L2_C, L2_B, 0              ; WB/WA,      X,  X,           X
        = L2_C+L2_B, L2_C, L2_B, 0              ; Alt DCache, X,  X,           X
        = L2_C+L2_B, L2_C, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B, L2_C, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B, L2_C, L2_B, 0              ; X,          X,  X,           X

; SA-110 in Risc PC - WB only read-allocate cache, non-merging WB
XCBTableSA110
        = L2_C+L2_B,    0, L2_B, 0              ;        Default
        =      L2_B,    0, L2_B, 0              ; WT,         X,  Non-merging, X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/RA,      X,  Merging,     X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/WA,      X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; Alt DCache, X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X

; ARMv5 WB/WT read-allocate cache, non-merging WB (eg ARM920T)
XCBTableWBR
        = L2_C+L2_B,    0, L2_B, 0              ;        Default
        = L2_C     ,    0, L2_B, 0              ; WT,         X,  Non-merging, X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/RA,      X,  Merging,     X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/WA,      X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; Alt DCache, X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X

; SA-1110 - WB only read allocate cache, merging WB, mini D-cache
XCBTableSA1110
        = L2_C+L2_B,    0, L2_B, 0              ;        Default
        =      L2_B,    0,    0, 0              ; WT,         X,  Non-merging, X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/RA,      X,  Merging,     X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/WA,      X,  X,           X
        = L2_C     ,    0, L2_B, 0              ; Alt DCache, X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
;          defaulting to read-allocate
XCBTableXScaleRA
        =      L2_C+L2_B,    0,      L2_B, 0    ;        Default
        =      L2_C     ,    0, L2_X+L2_B, 0    ; WT,         X,  Non-merging, X
        =      L2_C+L2_B,    0,      L2_B, 0    ; WB/RA,      X,  Merging,     X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; WB/WA,      X,  X,           X
        = L2_X+L2_C     ,    0,      L2_B, 0    ; Alt DCache, X,  X,           X
        =      L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X
        =      L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X
        =      L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
;          defaulting to write-allocate
XCBTableXScaleWA
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ;        Default
        =      L2_C     ,    0, L2_X+L2_B, 0    ; WT,         X,  Non-merging, X
        =      L2_C+L2_B,    0,      L2_B, 0    ; WB/RA,      X,  Merging,     X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; WB/WA,      X,  X,           X
        = L2_X+L2_C     ,    0,      L2_B, 0    ; Alt DCache, X,  X,           X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X

        END