ARMops

; Copyright 2000 Pace Micro Technology plc
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;     http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;
  ;      GET     Hdr:ListOpts
  ;      GET     Hdr:Macros
  ;      GET     Hdr:System
  ;      $GetCPU
  ;      $GetMEMM

  ;      GET     hdr.Options

  ;      GET     Hdr:PublicWS
  ;      GET     Hdr:KernelWS

  ;      GET     hdr.Copro15ops
  ;      GET     hdr.ARMops

v7      RN      10

  ;      EXPORT  Init_ARMarch
  ;      EXPORT  ARM_Analyse
  ;      EXPORT  ARM_PrintProcessorType

 ;       AREA    KernelCode,CODE,READONLY

; ARM keep changing their mind about ID field layout.
; Here's a summary, courtesy of the ARM ARM (v5):
;
; pre-ARM 7:   xxxx0xxx
; ARM 7:       xxxx7xxx where bit 23 indicates v4T/~v3
; post-ARM 7:  xxxanxxx where n<>0 or 7 and a = architecture (1=4,2=4T,3=5,4=5T)
;

; int Init_ARMarch(void)
; Reads the processor ID register and classifies it using the ID layouts
; summarised above.
; Out:      a1    = architecture value (0 for pre-ARM 7 parts, ARMv3/ARMv4T for
;                   ARM 7, otherwise ID bits 16-19)
;           flags = EQ if ARMv3, NE if ARMv4 or later
; Corrupts: ip only. No RAM usage.
Init_ARMarch
        ARM_read_ID ip
        ANDS    a1, ip, #&0000F000              ; ID bits 12-15
        MOVEQ   pc, lr                          ; zero => ARM 3 or ARM 6 (a1 = 0, EQ)
        TEQ     a1, #&00007000
        BEQ     %FT20                           ; 7 => ARM 7 family, decoded below

        ANDS    a1, ip, #&000F0000              ; post-ARM 7: architecture lives in bits 16-19
        MOV     a1, a1, LSR #16                 ; (MOV leaves the NE from ANDS intact)
        MOV     pc, lr

20      TST     ip, #&00800000                  ; ARM 7: bit 23 distinguishes v4T from v3
        MOVEQ   a1, #ARMv3
        MOVNE   a1, #ARMv4T
        MOV     pc, lr


; ARM_Analyse
; Identifies the processor and records the result in the workspace at ZeroPage:
;   ProcessorType, ProcessorArch, ProcessorFlags, Cache_Type and the
;   ICache_*/DCache_* geometry (via EvaluateCache). It then dispatches to one of
;   the Analyse_* sections, which fill in the Proc_* ARMop pointers and
;   MMU_PCBTrans and leave through the common exit at label 90.
; When MEMM_Type = "VMSAv6" and Init_ARMarch reports ARMvF, all of the work is
; handed to ARM_Analyse_Fancy instead.
; Corrupts: a1-a4, ip, flags. v1,v2,v5,v6,v7 are stacked and restored.
; An unrecognised cache arrangement never returns (see label 20 and WeirdARMPanic).
ARM_Analyse
        MOV     a2, lr                          ; Init_ARMarch corrupts only ip, so a2 is a safe place for lr
        BL      Init_ARMarch
        MOV     lr, a2
 [ MEMM_Type = "VMSAv6"
        CMP     a1, #ARMvF
        BEQ     ARM_Analyse_Fancy ; New ARM; use the feature regs to perform all the setup
 ]
        Push    "v1,v2,v5,v6,v7,lr"
        ARM_read_ID v1
        ARM_read_cachetype v2
        LDR     v6, =ZeroPage

        ; Walk the 16-byte KnownCPUTable entries: {ID, mask, cache type, type/arch bytes}
        ADRL    v7, KnownCPUTable
FindARMloop
        LDMIA   v7!, {a1, a2}                   ; See if it's a known ARM
        CMP     a1, #-1
        BEQ     %FT20
        AND     a2, v1, a2
        TEQ     a1, a2
        ADDNE   v7, v7, #8                      ; no match: skip the rest of this entry
        BNE     FindARMloop
        TEQ     v2, v1                          ; If we don't have cache attributes, read from table
        LDREQ   v2, [v7]

20      TEQ     v2, v1
        BEQ     %BT20                           ; Cache unknown: panic

        CMP     a1, #-1
        LDRNEB  a2, [v7, #4]                    ; matched: processor type byte from the table entry
        MOVEQ   a2, #ARMunk
        STRB    a2, [v6, #ProcessorType]

        ASSERT  CT_Isize_pos = 0
        MOV     a1, v2
        ADD     a2, v6, #ICache_Info
        BL      EvaluateCache
        MOV     a1, v2, LSR #CT_Dsize_pos
        ADD     a2, v6, #DCache_Info
        BL      EvaluateCache

        AND     a1, v2, #CT_ctype_mask
        MOV     a1, a1, LSR #CT_ctype_pos
        STRB    a1, [v6, #Cache_Type]

        ; From here on v5 accumulates the CPUFlag_* bits
        [ No26bitCode
        MOV     v5, #CPUFlag_32bitOS
        |
        MOV     v5, #0
        ]
        [ HiProcVecs
        ORR     v5, v5, #CPUFlag_HiProcVecs
        ]

        TST     v2, #CT_S
        ORRNE   v5, v5, #CPUFlag_SplitCache+CPUFlag_SynchroniseCodeAreas

        [ CacheOff
        ORR     v5, v5, #CPUFlag_SynchroniseCodeAreas
        |
        ARM_read_control a1                     ; if Z bit set then we have branch prediction,
        TST     a1, #MMUC_Z                     ; so we need OS_SynchroniseCodeAreas even if not
        ORRNE   v5, v5, #CPUFlag_SynchroniseCodeAreas   ; split caches
        ]

        ; Test abort timing (base restored or base updated)
        MOV     a1, #&8000
        LDR     a2, [a1], #4                    ; Will abort - DAb handler will continue execution
        TEQ     a1, #&8000
        ORREQ   v5, v5, #CPUFlag_BaseRestored

        ; Check store of PC
30      STR     pc, [sp, #-4]!
        ADR     a2, %BT30 + 8
        LDR     a1, [sp], #4
        TEQ     a1, a2
        ORREQ   v5, v5, #CPUFlag_StorePCplus8

        [ 0=1
        ; Check whether 26-bit mode is available
        MSR     CPSR_c, #F32_bit+I32_bit+SVC26_mode
        MRS     a1, CPSR
        AND     a1, a1, #M32_bits
        TEQ     a1, #SVC26_mode
        ORRNE   v5, v5, #CPUFlag_No26bitMode
        MSREQ   CPSR_c, #F32_bit+I32_bit+SVC32_mode
        BNE     %FT35

        ; Do we get vector exceptions on read?
        LDR     a2, =ZeroPage
        MOV     a1, a2
        LDR     a1, [a1]                        ; If this aborts a1 will be left unchanged
        TEQ     a1, a2
        ORREQ   v5, v5, #CPUFlag_VectorReadException
        ]
35

        BL      Init_ARMarch
        STRB    a1, [v6, #ProcessorArch]

        TEQ     a1, #ARMv3                      ; assume long multiply available
        ORRNE   v5, v5, #CPUFlag_LongMul        ; if v4 or later
        TEQNE   a1, #ARMv4                      ; assume 26-bit available
        ORRNE   v5, v5, #CPUFlag_No26bitMode    ; iff v3 or v4 (not T)
        TEQNE   a1, #ARMv5                      ; assume Thumb available
        ORRNE   v5, v5, #CPUFlag_Thumb          ; iff not v3,v4,v5

        MSR     CPSR_f, #Q32_bit                ; try to set Q; it only reads back set if implemented
        MRS     lr, CPSR
        TST     lr, #Q32_bit
        ORRNE   v5, v5, #CPUFlag_DSP

        ; The processor type is held in a4 rather than v4: v4 is not in the
        ; Push list above, so using it would corrupt a callee-saved register.
        LDRB    a4, [v6, #ProcessorType]

        TEQ     a4, #ARMunk                     ; Modify deduced flags
        ADRNEL  lr, KnownCPUFlags
        ADDNE   lr, lr, a4, LSL #3              ; 8 bytes per entry: {flags to set, flags to clear}
        LDMNEIA lr, {a2, a3}
        ORRNE   v5, v5, a2
        BICNE   v5, v5, a3

 [ XScaleJTAGDebug
        TST     v5, #CPUFlag_XScale
        BEQ     %FT40

        MRC     p14, 0, a2, c10, c0             ; Read debug control register
        TST     a2, #&80000000
        ORRNE   v5, v5, #CPUFlag_XScaleJTAGconnected
        MOVEQ   a2, #&C000001C                  ; enable hot debug
        MCREQ   p14, 0, a2, c10, c0
40
 ]

        STR     v5, [v6, #ProcessorFlags]

        ; Now, a1 = processor architecture (ARMv3, ARMv4 ...)
        ;      a4 = processor type (ARM600, ARM610, ...)
        ;      v5 = processor flags

        CMP     a1, #ARMv4
        BLO     Analyse_ARMv3                   ; eg. ARM710

        LDRB    a2, [v6, #Cache_Type]
        TEQ     a2, #CT_ctype_WT
        TSTEQ   v5, #CPUFlag_SplitCache
        BEQ     Analyse_WriteThroughUnified     ; eg. ARM7TDMI derivative

        TEQ     a2, #CT_ctype_WB_CR7_LDa
        BEQ     Analyse_WB_CR7_LDa              ; eg. ARM9

        TEQ     a2, #CT_ctype_WB_Crd
        BEQ     Analyse_WB_Crd                  ; eg. StrongARM

        TEQ     a2, #CT_ctype_WB_Cal_LD
        BEQ     Analyse_WB_Cal_LD               ; assume XScale

        ; others ...

        ; Unsupported cache arrangement: spin here forever
WeirdARMPanic
        B       WeirdARMPanic                   ; stiff :)

; Analyse_ARMv3
; Installs the ARMop pointers and page table cache-bit translation table for
; ARMv3 processors. Reached from ARM_Analyse when the architecture is below
; ARMv4; v6 -> ZeroPage workspace. Leaves via the common exit at label 90.
Analyse_ARMv3
        ; Operations that need no work on these parts all share NullOp
        ADRL    a1, NullOp
        STR     a1, [v6, #Proc_Cache_CleanAll]
        STR     a1, [v6, #Proc_IMB_Full]
        STR     a1, [v6, #Proc_IMB_Range]

        ; One routine serves both clean+invalidate and invalidate
        ADRL    a1, Cache_Invalidate_ARMv3
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]
        STR     a1, [v6, #Proc_Cache_InvalidateAll]

        ADRL    a1, WriteBuffer_Drain_ARMv3
        STR     a1, [v6, #Proc_WriteBuffer_Drain]

        ADRL    a1, TLB_Invalidate_ARMv3
        STR     a1, [v6, #Proc_TLB_InvalidateAll]
        ADRL    a1, TLB_InvalidateEntry_ARMv3
        STR     a1, [v6, #Proc_TLB_InvalidateEntry]

        ; MMU mapping-change notifications
        ADRL    a1, MMU_Changing_ARMv3
        STR     a1, [v6, #Proc_MMU_Changing]
        ADRL    a1, MMU_ChangingEntry_ARMv3
        STR     a1, [v6, #Proc_MMU_ChangingEntry]
        ADRL    a1, MMU_ChangingUncached_ARMv3
        STR     a1, [v6, #Proc_MMU_ChangingUncached]
        ADRL    a1, MMU_ChangingUncachedEntry_ARMv3
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntry]
        ADRL    a1, MMU_ChangingEntries_ARMv3
        STR     a1, [v6, #Proc_MMU_ChangingEntries]
        ADRL    a1, MMU_ChangingUncachedEntries_ARMv3
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntries]

        ADRL    a1, Cache_RangeThreshold_ARMv3
        STR     a1, [v6, #Proc_Cache_RangeThreshold]

        ADRL    a1, XCBTableWT
        STR     a1, [v6, #MMU_PCBTrans]
        B       %FT90

; Analyse_WriteThroughUnified
; Installs the ARMop pointers and XCB table for processors whose cache type is
; CT_ctype_WT with CPUFlag_SplitCache clear. In: v5 = processor flags,
; v6 -> ZeroPage workspace. Leaves via the common exit at label 90.
Analyse_WriteThroughUnified
        ; Operations that need no work here all share NullOp
        ADRL    a1, NullOp
        STR     a1, [v6, #Proc_Cache_CleanAll]
        STR     a1, [v6, #Proc_IMB_Full]
        STR     a1, [v6, #Proc_IMB_Range]

        ; One routine serves both clean+invalidate and invalidate
        ADRL    a1, Cache_InvalidateUnified
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]
        STR     a1, [v6, #Proc_Cache_InvalidateAll]

        ; Parts flagged CPUFlag_NoWBDrain get the OffOn variant of the drain routine
        TST     v5, #CPUFlag_NoWBDrain
        ADREQL  a1, WriteBuffer_Drain
        ADRNEL  a1, WriteBuffer_Drain_OffOn
        STR     a1, [v6, #Proc_WriteBuffer_Drain]

        ADRL    a1, TLB_Invalidate_Unified
        STR     a1, [v6, #Proc_TLB_InvalidateAll]
        ADRL    a1, TLB_InvalidateEntry_Unified
        STR     a1, [v6, #Proc_TLB_InvalidateEntry]

        ; MMU mapping-change notifications
        ADRL    a1, MMU_Changing_Writethrough
        STR     a1, [v6, #Proc_MMU_Changing]
        ADRL    a1, MMU_ChangingEntry_Writethrough
        STR     a1, [v6, #Proc_MMU_ChangingEntry]
        ADRL    a1, MMU_ChangingUncached
        STR     a1, [v6, #Proc_MMU_ChangingUncached]
        ADRL    a1, MMU_ChangingUncachedEntry
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntry]
        ADRL    a1, MMU_ChangingEntries_Writethrough
        STR     a1, [v6, #Proc_MMU_ChangingEntries]
        ADRL    a1, MMU_ChangingUncachedEntries
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntries]

        ADRL    a1, Cache_RangeThreshold_Writethrough
        STR     a1, [v6, #Proc_Cache_RangeThreshold]

        ADRL    a1, XCBTableWT
        STR     a1, [v6, #MMU_PCBTrans]
        B       %FT90

; Analyse_WB_CR7_LDa
; Installs the ARMop pointers for cache type CT_ctype_WB_CR7_LDa and derives the
; D-cache index values (DCache_IndexBit, DCache_IndexSegStart) from the geometry
; already stored in DCache_*. In: v5 = processor flags, v6 -> ZeroPage workspace.
; Leaves via the common exit at label 90; panics if the cache is not split.
Analyse_WB_CR7_LDa
        TST     v5, #CPUFlag_SplitCache
        BEQ     WeirdARMPanic             ; currently, only support harvard caches here (eg. ARM920)

        ; Whole-cache operations
        ADRL    a1, Cache_CleanInvalidateAll_WB_CR7_LDa
        ADRL    a2, Cache_CleanAll_WB_CR7_LDa
        ADRL    a3, Cache_InvalidateAll_WB_CR7_LDa
        ADRL    a4, Cache_RangeThreshold_WB_CR7_LDa
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]
        STR     a2, [v6, #Proc_Cache_CleanAll]
        STR     a3, [v6, #Proc_Cache_InvalidateAll]
        STR     a4, [v6, #Proc_Cache_RangeThreshold]

        ; TLB and write buffer
        ADRL    a1, TLB_InvalidateAll_WB_CR7_LDa
        ADRL    a2, TLB_InvalidateEntry_WB_CR7_LDa
        ADRL    a3, WriteBuffer_Drain_WB_CR7_LDa
        STR     a1, [v6, #Proc_TLB_InvalidateAll]
        STR     a2, [v6, #Proc_TLB_InvalidateEntry]
        STR     a3, [v6, #Proc_WriteBuffer_Drain]

        ; Instruction memory barriers
        ADRL    a1, IMB_Full_WB_CR7_LDa
        ADRL    a2, IMB_Range_WB_CR7_LDa
        STR     a1, [v6, #Proc_IMB_Full]
        STR     a2, [v6, #Proc_IMB_Range]

        ; MMU mapping-change notifications
        ADRL    a1, MMU_Changing_WB_CR7_LDa
        ADRL    a2, MMU_ChangingEntry_WB_CR7_LDa
        ADRL    a3, MMU_ChangingUncached_WB_CR7_LDa
        ADRL    a4, MMU_ChangingUncachedEntry_WB_CR7_LDa
        STR     a1, [v6, #Proc_MMU_Changing]
        STR     a2, [v6, #Proc_MMU_ChangingEntry]
        STR     a3, [v6, #Proc_MMU_ChangingUncached]
        STR     a4, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_WB_CR7_LDa
        ADRL    a2, MMU_ChangingUncachedEntries_WB_CR7_LDa
        STR     a1, [v6, #Proc_MMU_ChangingEntries]
        STR     a2, [v6, #Proc_MMU_ChangingUncachedEntries]

        ; a4 = log2(associativity), rounded up: halve a3 from 256 until it no
        ; longer exceeds the associativity, counting a4 down from 8 in step
        LDRB    a2, [v6, #DCache_Associativity]

        MOV     a3, #256
        MOV     a4, #8           ; to find log2(ASSOC), rounded up
Analyse_WB_CR7_LDa_L1
        MOV     a3, a3, LSR #1
        SUB     a4, a4, #1
        CMP     a2, a3
        BLO     Analyse_WB_CR7_LDa_L1
        ADDHI   a4, a4, #1       ; not an exact power of two: round up

        ; DCache_IndexBit = 1 << (32 - log2(ASSOC))
        RSB     a2, a4, #32
        MOV     a3, #1
        MOV     a3, a3, LSL a2
        STR     a3, [v6, #DCache_IndexBit]

        ; DCache_IndexSegStart = (NSets - 1) * LineLen
        LDR     a4, [v6, #DCache_NSets]
        LDRB    a2, [v6, #DCache_LineLen]
        SUB     a4, a4, #1
        MUL     a4, a2, a4
        STR     a4, [v6, #DCache_IndexSegStart]

        MOV     a2, #64*1024                         ; arbitrary-ish
        STR     a2, [v6, #DCache_RangeThreshold]

        ADRL    a1, XCBTableWBR                      ; assume read-allocate WB/WT cache
        STR     a1, [v6, #MMU_PCBTrans]

        B       %FT90

; Analyse_WB_Crd
; Installs the ARMop pointers for cache type CT_ctype_WB_Crd, sets up the
; D-cache clean area addresses and chooses the XCB table by processor type.
; In: v5 = processor flags, v6 -> ZeroPage workspace.
; Leaves via the common exit at label 90; panics if the cache is not split.
Analyse_WB_Crd
        TST     v5, #CPUFlag_SplitCache
        BEQ     WeirdARMPanic             ; currently, only support harvard

        ; Whole-cache operations
        ADRL    a1, Cache_CleanInvalidateAll_WB_Crd
        ADRL    a2, Cache_CleanAll_WB_Crd
        ADRL    a3, Cache_InvalidateAll_WB_Crd
        ADRL    a4, Cache_RangeThreshold_WB_Crd
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]
        STR     a2, [v6, #Proc_Cache_CleanAll]
        STR     a3, [v6, #Proc_Cache_InvalidateAll]
        STR     a4, [v6, #Proc_Cache_RangeThreshold]

        ; TLB and write buffer
        ADRL    a1, TLB_InvalidateAll_WB_Crd
        ADRL    a2, TLB_InvalidateEntry_WB_Crd
        ADRL    a3, WriteBuffer_Drain_WB_Crd
        STR     a1, [v6, #Proc_TLB_InvalidateAll]
        STR     a2, [v6, #Proc_TLB_InvalidateEntry]
        STR     a3, [v6, #Proc_WriteBuffer_Drain]

        ; Instruction memory barriers
        ADRL    a1, IMB_Full_WB_Crd
        ADRL    a2, IMB_Range_WB_Crd
        STR     a1, [v6, #Proc_IMB_Full]
        STR     a2, [v6, #Proc_IMB_Range]

        ; MMU mapping-change notifications
        ADRL    a1, MMU_Changing_WB_Crd
        ADRL    a2, MMU_ChangingEntry_WB_Crd
        ADRL    a3, MMU_ChangingUncached_WB_Crd
        ADRL    a4, MMU_ChangingUncachedEntry_WB_Crd
        STR     a1, [v6, #Proc_MMU_Changing]
        STR     a2, [v6, #Proc_MMU_ChangingEntry]
        STR     a3, [v6, #Proc_MMU_ChangingUncached]
        STR     a4, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_WB_Crd
        ADRL    a2, MMU_ChangingUncachedEntries_WB_Crd
        STR     a1, [v6, #Proc_MMU_ChangingEntries]
        STR     a2, [v6, #Proc_MMU_ChangingUncachedEntries]

        ; Both clean-area pointers start at DCacheCleanAddress
        LDR     a2, =DCacheCleanAddress
        STR     a2, [v6, #DCache_CleanBaseAddress]
        STR     a2, [v6, #DCache_CleanNextAddress]
        MOV     a2, #64*1024                       ;arbitrary-ish threshold
        STR     a2, [v6, #DCache_RangeThreshold]

        ; XCB table: SA110 -> XCBTableSA110, SA1100/SA1110 -> XCBTableSA1110,
        ; any other processor type -> XCBTableWBR
        LDRB    a2, [v6, #ProcessorType]
        TEQ     a2, #SA110
        ADREQL  a2, XCBTableSA110
        BEQ     Analyse_WB_Crd_finish
        TEQ     a2, #SA1100
        TEQNE   a2, #SA1110
        ADREQL  a2, XCBTableSA1110
        ADRNEL  a2, XCBTableWBR
Analyse_WB_Crd_finish
        STR     a2, [v6, #MMU_PCBTrans]
        B       %FT90

; Analyse_WB_Cal_LD
; Installs the ARMop pointers for cache type CT_ctype_WB_Cal_LD (treated as
; XScale by the dispatcher), sets up the clean area addresses, enables
; coprocessor access and selects the XScale XCB table.
; In: v5 = processor flags, v6 -> ZeroPage workspace.
; Leaves via the common exit at label 90; panics if the cache is not split.
Analyse_WB_Cal_LD
        TST     v5, #CPUFlag_SplitCache
        BEQ     WeirdARMPanic             ; currently, only support harvard

        ; Whole-cache operations
        ADRL    a1, Cache_CleanInvalidateAll_WB_Cal_LD
        ADRL    a2, Cache_CleanAll_WB_Cal_LD
        ADRL    a3, Cache_InvalidateAll_WB_Cal_LD
        ADRL    a4, Cache_RangeThreshold_WB_Cal_LD
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]
        STR     a2, [v6, #Proc_Cache_CleanAll]
        STR     a3, [v6, #Proc_Cache_InvalidateAll]
        STR     a4, [v6, #Proc_Cache_RangeThreshold]

        ; TLB and write buffer
        ADRL    a1, TLB_InvalidateAll_WB_Cal_LD
        ADRL    a2, TLB_InvalidateEntry_WB_Cal_LD
        ADRL    a3, WriteBuffer_Drain_WB_Cal_LD
        STR     a1, [v6, #Proc_TLB_InvalidateAll]
        STR     a2, [v6, #Proc_TLB_InvalidateEntry]
        STR     a3, [v6, #Proc_WriteBuffer_Drain]

        ; Instruction memory barriers
        ADRL    a1, IMB_Full_WB_Cal_LD
        ADRL    a2, IMB_Range_WB_Cal_LD
        STR     a1, [v6, #Proc_IMB_Full]
        STR     a2, [v6, #Proc_IMB_Range]

        ; MMU mapping-change notifications
        ADRL    a1, MMU_Changing_WB_Cal_LD
        ADRL    a2, MMU_ChangingEntry_WB_Cal_LD
        ADRL    a3, MMU_ChangingUncached_WB_Cal_LD
        ADRL    a4, MMU_ChangingUncachedEntry_WB_Cal_LD
        STR     a1, [v6, #Proc_MMU_Changing]
        STR     a2, [v6, #Proc_MMU_ChangingEntry]
        STR     a3, [v6, #Proc_MMU_ChangingUncached]
        STR     a4, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_WB_Cal_LD
        ADRL    a2, MMU_ChangingUncachedEntries_WB_Cal_LD
        STR     a1, [v6, #Proc_MMU_ChangingEntries]
        STR     a2, [v6, #Proc_MMU_ChangingUncachedEntries]

        ; Both main D-cache clean-area pointers start at DCacheCleanAddress
        LDR     a2, =DCacheCleanAddress
        STR     a2, [v6, #DCache_CleanBaseAddress]
        STR     a2, [v6, #DCache_CleanNextAddress]

  [ XScaleMiniCache
        !       1, "You need to arrange for XScale mini-cache clean area to be mini-cacheable"
        LDR     a2, =DCacheCleanAddress + 4 * 32*1024
        STR     a2, [v6, #MCache_CleanBaseAddress]
        STR     a2, [v6, #MCache_CleanNextAddress]
  ]


  ; arbitrary-ish values, mini cache makes global op significantly more expensive
  [ XScaleMiniCache
        MOV     a2, #128*1024
  |
        MOV     a2, #32*1024
  ]
        STR     a2, [v6, #DCache_RangeThreshold]

        ; enable full coprocessor access
        LDR     a2, =&3FFF
        MCR     p15, 0, a2, c15, c1

        ADRL    a2, XCBTableXScaleWA ; choose between RA and WA here
        STR     a2, [v6, #MMU_PCBTrans]

        B       %FT90

 [ MEMM_Type = "VMSAv6"
; Analyse_WB_CR7_Lx
; Reached from ARM_Analyse_Fancy for cache type CT_ctype_WB_CR7_Lx. Records the
; cache level ID register and the per-level size registers in Cache_Lx_*,
; stores the smallest I and D line-size fields, installs the ARMop pointers and
; XCB table, then cleans the caches and enables the L2 cache.
; In: v5 = processor flags, v6 -> ZeroPage workspace.
; Uses v1 and v2 as scratch (both are restored by the common exit at label 90).
; Leaves via the common exit at label 90; panics if the cache is not split.
Analyse_WB_CR7_Lx
        TST     v5, #CPUFlag_SplitCache
        BEQ     WeirdARMPanic             ; currently, only support harvard caches here

        ; Read the cache info into Cache_Lx_*
        MRC     p15, 1, a1, c0, c0, 1 ; Cache level ID register
        MOV     v2, v6 ; Work around DTable/ITable alignment issues
        STR     a1, [v2, #Cache_Lx_Info]!
        ADD     a1, v2, #Cache_Lx_DTable-Cache_Lx_Info
        ADD     a2, v2, #Cache_Lx_ITable-Cache_Lx_Info
        MOV     a3, #0
        MOV     a4, #256 ; Smallest instruction cache line length
        MOV     v2, #256 ; Smallest data/unified cache line length (although atm we only need this to be the smallest data cache line length)
        ; a3 is the cache size selection value: even = data/unified, odd = instruction.
        ; NOTE(review): the loop probes selector values 0-15 without consulting the
        ; level ID register read above - confirm that reading the size register for
        ; an unimplemented level is safe on every supported CPU.
10
        MCR     p15, 2, a3, c0, c0, 0 ; Program cache size selection register
        MOV     v1, #0                ; v1 is free here (reloaded by the MRC below); MOV leaves flags alone
        MCR     p15, 0, v1, c7, c5, 4 ; ISB (CP15 form): selection must take effect before the size read
        MRC     p15, 1, v1, c0, c0, 0 ; Get size info (data/unified)
        STR     v1, [a1],#4
        CMP     v1, #0 ; Does the cache exist?
        AND     v1, v1, #7 ; Get line size
        CMPNE   v1, v2
        MOVLT   v2, v1 ; Earlier CMP will not set LE flags if v1=0
        ADD     a3, a3, #1
        MCR     p15, 2, a3, c0, c0, 0 ; Program cache size selection register
        MOV     v1, #0
        MCR     p15, 0, v1, c7, c5, 4 ; ISB (CP15 form), as above
        MRC     p15, 1, v1, c0, c0, 0 ; Get size info (instruction)
        STR     v1, [a2],#4
        CMP     v1, #0 ; Does the cache exist?
        AND     v1, v1, #7 ; Get line size
        CMPNE   v1, a4
        MOVLT   a4, v1 ; Earlier CMP will not set LE flags if v1=0
        ADD     a3, a3, #1
        CMP     a3, #16
        BLT     %BT10
        STRB    a4, [v6, #ICache_LineLen] ; Store log2(line size)-2
        STRB    v2, [v6, #DCache_LineLen] ; log2(line size)-2

        ; Calculate DCache_RangeThreshold
        MOV     a1, #128*1024 ; Arbitrary-ish
        STR     a1, [v6, #DCache_RangeThreshold]

        ADRL    a1, Cache_CleanInvalidateAll_WB_CR7_Lx
        STR     a1, [v6, #Proc_Cache_CleanInvalidateAll]

        ADRL    a1, Cache_CleanAll_WB_CR7_Lx
        STR     a1, [v6, #Proc_Cache_CleanAll]

        ADRL    a1, Cache_InvalidateAll_WB_CR7_Lx
        STR     a1, [v6, #Proc_Cache_InvalidateAll]

        ADRL    a1, Cache_RangeThreshold_WB_CR7_Lx
        STR     a1, [v6, #Proc_Cache_RangeThreshold]

        ADRL    a1, TLB_InvalidateAll_WB_CR7_Lx
        STR     a1, [v6, #Proc_TLB_InvalidateAll]

        ADRL    a1, TLB_InvalidateEntry_WB_CR7_Lx
        STR     a1, [v6, #Proc_TLB_InvalidateEntry]

        ADRL    a1, WriteBuffer_Drain_WB_CR7_Lx
        STR     a1, [v6, #Proc_WriteBuffer_Drain]

        ADRL    a1, IMB_Full_WB_CR7_Lx
        STR     a1, [v6, #Proc_IMB_Full]

        ADRL    a1, IMB_Range_WB_CR7_Lx
        STR     a1, [v6, #Proc_IMB_Range]

        ADRL    a1, MMU_Changing_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_Changing]

        ADRL    a1, MMU_ChangingEntry_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingEntry]

        ADRL    a1, MMU_ChangingUncached_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingUncached]

        ADRL    a1, MMU_ChangingUncachedEntry_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntry]

        ADRL    a1, MMU_ChangingEntries_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingEntries]

        ADRL    a1, MMU_ChangingUncachedEntries_WB_CR7_Lx
        STR     a1, [v6, #Proc_MMU_ChangingUncachedEntries]

        ADRL    a1, XCBTableWBR                      ; assume read-allocate WB/WT cache
        STR     a1, [v6, #MMU_PCBTrans]

        ; Enable L2 cache. This could probably be moved earlier on in the boot sequence
        ; (e.g. when the MMU is turned on), but for now it will go here to reduce the chances
        ; of stuff breaking
        BL      Cache_CleanInvalidateAll_WB_CR7_Lx ; Ensure L2 cache is clean

    [ M_CortexA9
        ; write access to ACTLR is only permitted in Secure Mode
        ; so we must use smc API calls
        STMFD   sp!, {a2-a4,v3-v4,ip}
        LDR     ip, =0x102      ; enable/disable PL310 L2 Cache controller
        MOV     a1, #1          ; enable
        myDSB
        DCI     0xE1600070      ; SMC #0
        LDMFD   sp!, {a2-a4,v3-v4,ip}
    |
        MRC     p15, 0, a1, c1, c0, 1
        ORR     a1, a1, #2 ; L2EN
        MCR     p15, 0, a1, c1, c0, 1
    ] ; M_CortexA9

        B       %FT90
 ] ; MEMM_Type = "VMSAv6"

; Common exit for ARM_Analyse and ARM_Analyse_Fancy: every Analyse_* section
; branches here to unstack the registers pushed on entry and return (lr -> pc).
90
        Pull    "v1,v2,v5,v6,v7,pc"


; This routine works out the values LINELEN, ASSOCIATIVITY, NSETS and CACHE_SIZE defined
; in section B2.3.3 of the ARMv5 ARM.
; In:       a1 = cache size field, positioned at bit 0 (callers shift the D field down)
;           a2 -> info block to fill in (ICache_Info or DCache_Info; offsets are
;                 expressed relative to ICache_Info and used for either block)
; Out:      LineLen, Associativity, Size and NSets stored in the block;
;           all four are zero if the field describes an absent cache
; Corrupts: a3, a4, ip (and a1 on the absent-cache path)
EvaluateCache
        AND     a3, a1, #CT_assoc_mask+CT_M
        TEQ     a3, #(CT_assoc_0:SHL:CT_assoc_pos)+CT_M
        BEQ     %FT80                           ; assoc = 0 with M set: cache absent
        MOV     ip, #1
        ASSERT  CT_len_pos = 0
        AND     a4, a1, #CT_len_mask
        ADD     a4, a4, #3
        MOV     a4, ip, LSL a4                  ; LineLen = 1 << (len+3)
        STRB    a4, [a2, #ICache_LineLen-ICache_Info]
        MOV     a3, #2
        TST     a1, #CT_M
        ADDNE   a3, a3, #1                      ; Multiplier = 2 + M
        AND     a4, a1, #CT_assoc_mask
        MOV     a4, a4, LSR #CT_assoc_pos
        ; Shift by assoc and then halve, rather than shifting by (assoc-1):
        ; for a direct-mapped cache (assoc = 0, M = 0) a shift count of -1 is
        ; taken as 255 by a register-specified LSL and would yield 0 instead of 1.
        MOV     a4, a3, LSL a4
        MOV     a4, a4, LSR #1                  ; Associativity = Multiplier << (assoc-1)
        STRB    a4, [a2, #ICache_Associativity-ICache_Info]
        AND     a4, a1, #CT_size_mask
        MOV     a4, a4, LSR #CT_size_pos
        MOV     a3, a3, LSL a4
        MOV     a3, a3, LSL #8                  ; Size = Multiplier << (size+8)
        STR     a3, [a2, #ICache_Size-ICache_Info]
        ADD     a4, a4, #6
        AND     a3, a1, #CT_assoc_mask
        SUB     a4, a4, a3, LSR #CT_assoc_pos
        AND     a3, a1, #CT_len_mask
        ASSERT  CT_len_pos = 0
        SUB     a4, a4, a3
        MOV     a4, ip, LSL a4                  ; NSets = 1 << (size + 6 - assoc - len)
        STR     a4, [a2, #ICache_NSets-ICache_Info]
        MOV     pc, lr


        ; Absent cache: zero every field of the block
80      MOV     a1, #0
        STR     a1, [a2, #ICache_NSets-ICache_Info]
        STR     a1, [a2, #ICache_Size-ICache_Info]
        STRB    a1, [a2, #ICache_LineLen-ICache_Info]
        STRB    a1, [a2, #ICache_Associativity-ICache_Info]
        MOV     pc, lr


; Create a list of CPUs, 16 bytes per entry:
;    ID bits (1 word)
;    Test mask for ID (1 word)
;    Cache type register value (1 word)
;    Processor type (1 byte)
;    Architecture type (1 byte)
;    Reserved (2 bytes)
;
; CPUDesc emits one such entry. Parameters:
;    $proc             processor type value, emitted as the first byte after the words
;    $id, $mask        expected ID bits and the mask applied to the ID register
;                      before comparing (asserted: $id has no bits outside $mask)
;    $arch             architecture value, emitted as the second byte
;    $type             cache type suffix; CT_ctype_$type is placed at CT_ctype_pos
;    $s                value placed at CT_S_pos
;    $dsz, $das, $dln  D-cache size, associativity and line length (see CSzDesc)
;    $isz, $ias, $iln  I-cache equivalents. When $s is 0 and $isz is omitted, the
;                      D-cache descriptor is reused for the I-cache field.
; tempcpu is a global scratch variable holding the most recent CSzDesc result.
        GBLA    tempcpu

        MACRO
        CPUDesc $proc, $id, $mask, $arch, $type, $s, $dsz, $das, $dln, $isz, $ias, $iln
        LCLA    type
type    SETA    (CT_ctype_$type:SHL:CT_ctype_pos)+($s:SHL:CT_S_pos)
tempcpu CSzDesc $dsz, $das, $dln
type    SETA    type+(tempcpu:SHL:CT_Dsize_pos)
        [ :LNOT:($s=0 :LAND: "$isz"="")
tempcpu CSzDesc $isz, $ias, $iln
        ]
type    SETA    type+(tempcpu:SHL:CT_Isize_pos)
        ASSERT  ($id :AND: :NOT: $mask) = 0
        DCD     $id, $mask, type
        DCB     $proc, $arch, 0, 0
        MEND

; CSzDesc sets the arithmetic variable named in the label field to a cache size
; descriptor built from the CT_size_<sz>, CT_assoc_<as> and CT_len_<ln> symbols,
; each shifted to its field position, plus CT_M_<sz> shifted to CT_M_pos.
        MACRO
$var    CSzDesc $sz, $as, $ln
$var    SETA    (CT_size_$sz:SHL:CT_size_pos)+(CT_assoc_$as:SHL:CT_assoc_pos)+(CT_len_$ln:SHL:CT_len_pos)
$var    SETA    $var+(CT_M_$sz:SHL:CT_M_pos)
        MEND


; CPUDesc table for ARMv3-ARMv6
; Searched in order by FindARMloop in ARM_Analyse: the first entry whose ID
; equals (ID register AND Mask) wins, so specific parts must precede the
; catch-all ARMunk entries that share their ID bits. An ID word of -1 ends the table.
; The cache fields are only used when the CPU has no cache type register.
KnownCPUTable
;                                                        /------Cache Type register fields-----\
;                              ID reg   Mask     Arch    Type         S  Dsz Das Dln Isz Ias Iln
        CPUDesc ARM600,        &000600, &00FFF0, ARMv3,   WT,         0,  4K, 64, 4
        CPUDesc ARM610,        &000610, &00FFF0, ARMv3,   WT,         0,  4K, 64, 4
        CPUDesc ARMunk,        &000000, &00F000, ARMv3,   WT,         0,  4K, 64, 4
        CPUDesc ARM700,        &007000, &FFFFF0, ARMv3,   WT,         0,  8K,  4, 8
        CPUDesc ARM710,        &007100, &FFFFF0, ARMv3,   WT,         0,  8K,  4, 8
        CPUDesc ARM710a,       &047100, &FDFFF0, ARMv3,   WT,         0,  8K,  4, 4
        CPUDesc ARM7500,       &067100, &FFFFF0, ARMv3,   WT,         0,  4K,  4, 4
        CPUDesc ARM7500FE,     &077100, &FFFFF0, ARMv3,   WT,         0,  4K,  4, 4
        CPUDesc ARMunk,        &007000, &80F000, ARMv3,   WT,         0,  8K,  4, 4
        CPUDesc ARM720T,       &807200, &FFFFF0, ARMv4T,  WT,         0,  8K,  4, 4
        CPUDesc ARMunk,        &807000, &80F000, ARMv4T,  WT,         0,  8K,  4, 4
        CPUDesc SA110_preRevT, &01A100, &0FFFFC, ARMv4,   WB_Crd,     1, 16K, 32, 8, 16K, 32, 8
        CPUDesc SA110,         &01A100, &0FFFF0, ARMv4,   WB_Crd,     1, 16K, 32, 8, 16K, 32, 8
        CPUDesc SA1100,        &01A110, &0FFFF0, ARMv4,   WB_Crd,     1,  8K, 32, 8, 16K, 32, 8
        CPUDesc SA1110,        &01B110, &0FFFF0, ARMv4,   WB_Crd,     1,  8K, 32, 8, 16K, 32, 8
        CPUDesc ARM920T,       &029200, &0FFFF0, ARMv4T,  WB_CR7_LDa, 1, 16K, 64, 8, 16K, 64, 8
        CPUDesc ARM922T,       &029220, &0FFFF0, ARMv4T,  WB_CR7_LDa, 1,  8K, 64, 8,  8K, 64, 8
        CPUDesc X80200,        &052000, &0FFFF0, ARMv5TE, WB_Cal_LD,  1, 32K, 32, 8, 32K, 32, 8
        CPUDesc X80321,    &69052400, &FFFFF700, ARMv5TE, WB_Cal_LD,  1, 32K, 32, 8, 32K, 32, 8
        DCD     -1

; Simplified CPUDesc table for ARMvF
; The cache size data is ignored for ARMv7.
; Searched by ARM_Analyse_Fancy in the same way as KnownCPUTable; an ID word of
; -1 ends the table.
KnownCPUTable_Fancy
        CPUDesc Cortex_A8,     &00C080, &00FFF0, ARMvF, WB_CR7_Lx, 1, 16K, 32, 16, 16K, 32, 16
	CPUDesc Cortex_A9,     &00C090, &00FFF0, ARMvF, WB_CR7_Lx, 1, 32K, 32, 16, 32K, 32, 16
        CPUDesc ARM1176JZF_S,  &00B760, &00FFF0, ARMv6, WB_CR7_LDa, 1, 16K, 32, 16,16K, 32, 16
        DCD     -1

; Peculiar characteristics of individual ARMs not deducible otherwise. First field is
; flags to set, second flags to clear.
; Entries are 8 bytes each and are indexed by the ProcessorType value
; (address = KnownCPUFlags + type*8), so their order must match the numeric
; values of the processor type symbols, which are defined outside this file.
; The table is not consulted when the processor type is ARMunk.
KnownCPUFlags
        DCD     0,                            0    ; ARM 600
        DCD     0,                            0    ; ARM 610
        DCD     0,                            0    ; ARM 700
        DCD     0,                            0    ; ARM 710
        DCD     0,                            0    ; ARM 710a
        DCD     CPUFlag_AbortRestartBroken+CPUFlag_InterruptDelay,   0    ; SA 110 pre revT
        DCD     CPUFlag_InterruptDelay,       0    ; SA 110 revT or later
        DCD     0,                            0    ; ARM 7500
        DCD     0,                            0    ; ARM 7500FE
        DCD     CPUFlag_InterruptDelay,       0    ; SA 1100
        DCD     CPUFlag_InterruptDelay,       0    ; SA 1110
        DCD     CPUFlag_NoWBDrain,            0    ; ARM 720T
        DCD     0,                            0    ; ARM 920T
        DCD     0,                            0    ; ARM 922T
        DCD     CPUFlag_ExtendedPages+CPUFlag_XScale,  0    ; X80200
        DCD     CPUFlag_XScale,               0    ; X80321
        DCD     0,                            0    ; Cortex_A8
	DCD	0,			      0    ; Cortex_A9
        DCD     0,                            0    ; ARM1176JZF_S

 [ MEMM_Type = "VMSAv6"
; --------------------------------------------------------------------------
; ----- ARM_Analyse_Fancy --------------------------------------------------
; --------------------------------------------------------------------------
;
; For ARMv7 ARMs (arch=&F), we can detect everything via the feature registers
; TODO - There's some stuff in here that can be tidied up/removed

; Things we need to set up:
; ProcessorType     (as listed in hdr.ARMops)
; Cache_Type        (CT_ctype_* from hdr:MEMM.ARM600)
; ProcessorArch     (as reported by Init_ARMarch)
; ProcessorFlags    (CPUFlag_* from hdr.ARMops)
; Proc_*            (Cache/TLB/IMB/MMU function pointers)
; MMU_PCBTrans      (Points to lookup table for translating page table cache options)
; ICache_*, DCache_* (ICache, DCache properties - optional, since not used externally?)
;
; Entered by a branch from ARM_Analyse with lr = the caller's return address.
; Corrupts a1-a4, ip, flags. v1,v2,v5,v6,v7 are stacked here and restored by
; the common exit at label 90, which every Analyse_* section branches to.

ARM_Analyse_Fancy
        Push    "v1,v2,v5,v6,v7,lr"
        ARM_read_ID v1
        LDR     v6, =ZeroPage
        ADRL    v7, KnownCPUTable_Fancy
        ; Walk the 16-byte table entries: {ID, mask, cache type, type/arch bytes}
10
        LDMIA   v7!, {a1, a2}
        CMP     a1, #-1
        BEQ     %FT20
        AND     a2, v1, a2
        TEQ     a1, a2
        ADDNE   v7, v7, #8                      ; no match: skip the rest of this entry
        BNE     %BT10
20
        ; NOTE(review): if no entry matched, v7 has already stepped past the -1
        ; terminator, so this loads whatever follows the table - confirm intended.
        LDR     v2, [v7]
        CMP     a1, #-1
        LDRNEB  a2, [v7, #4]
        MOVEQ   a2, #ARMunk
        STRB    a2, [v6, #ProcessorType]

        AND     a1, v2, #CT_ctype_mask
        MOV     a1, a1, LSR #CT_ctype_pos
        STRB    a1, [v6, #Cache_Type]

        ; From here on v5 accumulates the CPUFlag_* bits
        MOV     v5, #CPUFlag_32bitOS+CPUFlag_No26bitMode ; 26bit has been obsolete for a long time
        [ HiProcVecs
        ORR     v5, v5, #CPUFlag_HiProcVecs
        ]

        ; Work out whether the cache info is in ARMv6 or ARMv7 style
        MRC     p15, 0, a1, c0, c0, 1
        TST     a1, #&80000000
        BNE     %FT25

        ; ARMv6 format cache type register.
        ; TODO - Use the cache type register to deduce the cache info.
        ; For now, just fall back on the values in the CPU table.
        ASSERT  CT_Isize_pos = 0
        MOV     a1, v2
        ADD     a2, v6, #ICache_Info
        BL      EvaluateCache
        MOV     a1, v2, LSR #CT_Dsize_pos
        ADD     a2, v6, #DCache_Info
        BL      EvaluateCache

        ; Take the split-cache flag from the table value, exactly as ARM_Analyse
        ; does. Without it a WB_CR7_LDa part on this path fails the split-cache
        ; check in Analyse_WB_CR7_LDa and ends up in WeirdARMPanic.
        TST     v2, #CT_S
        ORRNE   v5, v5, #CPUFlag_SplitCache+CPUFlag_SynchroniseCodeAreas
        B       %FT27

25
        ; ARMv7 format cache type register.
        ; This should(!) mean that we have the cache level ID register,
        ; and all the other ARMv7 cache registers.

        ; Do we have a split cache?
        MRC     p15, 1, a1, c0, c0, 1
        AND     a2, a1, #7
        TEQ     a2, #3
        ORREQ   v5, v5, #CPUFlag_SynchroniseCodeAreas+CPUFlag_SplitCache

27
        [ CacheOff
        ORR     v5, v5, #CPUFlag_SynchroniseCodeAreas
        |
        ARM_read_control a1                     ; if Z bit set then we have branch prediction,
        TST     a1, #MMUC_Z                     ; so we need OS_SynchroniseCodeAreas even if not
        ORRNE   v5, v5, #CPUFlag_SynchroniseCodeAreas   ; split caches
        ]

        ; Test abort timing (base restored or base updated)
        MOV     a1, #&8000
        LDR     a2, [a1], #4                    ; Will abort - DAb handler will continue execution
        TEQ     a1, #&8000
        ORREQ   v5, v5, #CPUFlag_BaseRestored

        ; Check store of PC
30      STR     pc, [sp, #-4]!
        ADR     a2, %BT30 + 8
        LDR     a1, [sp], #4
        TEQ     a1, a2
        ORREQ   v5, v5, #CPUFlag_StorePCplus8

        BL      Init_ARMarch
        STRB    a1, [v6, #ProcessorArch]

        ; NOTE(review): bits 12-15 of this register (ID_ISAR2) look like the
        ; Mult_instrs field (MLA/MLS) rather than the MultS/MultU long-multiply
        ; fields in bits 16-23 - confirm against the ARM ARM.
        MRC     p15, 0, a1, c0, c2, 2
        TST     a1, #&F000
        ORRNE   v5, v5, #CPUFlag_LongMul

        ; ID_PFR0: Thumb support is the State1 field, bits 4-7.
        ; (Bits 12-15 are State3 = ThumbEE, which would miss Thumb-capable
        ; parts that lack ThumbEE.)
        MRC     p15, 0, a1, c0, c1, 0
        TST     a1, #&F0
        ORRNE   v5, v5, #CPUFlag_Thumb

        MSR     CPSR_f, #Q32_bit                ; try to set Q; it only reads back set if implemented
        MRS     lr, CPSR
        TST     lr, #Q32_bit
        ORRNE   v5, v5, #CPUFlag_DSP ; Should we check instruction set attr register 3 for this?

        ; Other flags not checked for above:
        ; CPUFlag_InterruptDelay
        ; CPUFlag_VectorReadException
        ; CPUFlag_ExtendedPages
        ; CPUFlag_NoWBDrain
        ; CPUFlag_AbortRestartBroken
        ; CPUFlag_XScale
        ; CPUFlag_XScaleJTAGconnected

        ; The processor type is held in a4 rather than v4: v4 is not in the
        ; Push list above, so using it would corrupt a callee-saved register.
        LDRB    a4, [v6, #ProcessorType]

        TEQ     a4, #ARMunk                     ; Modify deduced flags
        ADRNEL  lr, KnownCPUFlags
        ADDNE   lr, lr, a4, LSL #3              ; 8 bytes per entry: {flags to set, flags to clear}
        LDMNEIA lr, {a2, a3}
        ORRNE   v5, v5, a2
        BICNE   v5, v5, a3

        STR     v5, [v6, #ProcessorFlags]

        ; Cache analysis

        LDRB    a2, [v6, #Cache_Type]
        TEQ     a2, #CT_ctype_WT
        TSTEQ   v5, #CPUFlag_SplitCache
        BEQ     Analyse_WriteThroughUnified     ; eg. ARM7TDMI derivative

        TEQ     a2, #CT_ctype_WB_CR7_LDa
        BEQ     Analyse_WB_CR7_LDa              ; eg. ARM9

        TEQ     a2, #CT_ctype_WB_Crd
        BEQ     Analyse_WB_Crd                  ; eg. StrongARM

        TEQ     a2, #CT_ctype_WB_Cal_LD
        BEQ     Analyse_WB_Cal_LD               ; assume XScale

        TEQ     a2, #CT_ctype_WB_CR7_Lx
        BEQ     Analyse_WB_CR7_Lx

        ; others ...

        B       WeirdARMPanic                   ; stiff :)
 ] ; MEMM_Type = "VMSAv6"

; --------------------------------------------------------------------------
; ----- ARMops -------------------------------------------------------------
; --------------------------------------------------------------------------
;
; ARMops are the routines required by the kernel for cache/MMU control
; the kernel vectors to the appropriate ops for the given ARM at boot
;
; The Rules:
;   - These routines may corrupt a1 and lr only
;   - (lr can of course only be corrupted whilst still returning to correct
;     link address)
;   - stack is available, at least 16 words can be stacked
;   - a NULL op would be a simple MOV pc, lr
;

; --------------------------------------------------------------------------
; ----- ARMops for ARMv3 ---------------------------------------------------
; --------------------------------------------------------------------------
;
; ARMv3 ARMs include ARM710, ARM610, ARM7500
;

; Cache_Invalidate_ARMv3
; Issues the CP15 c7, c0 write (the same operation the ARMv3 MMU_Changing
; routines annotate as "invalidate cache") and returns.
; The register written is a1 as passed in; no register is modified.
Cache_Invalidate_ARMv3
        MCR     p15, 0, a1, c7, c0
; NullOp: the do-nothing ARMop - simply returns to the caller.
; Cache_Invalidate_ARMv3 falls through into it for its own return.
NullOp  MOV     pc, lr

; WriteBuffer_Drain_ARMv3
; Drains the write buffer by doing a SWP on a temporary word on the stack.
; Exit: a1 = previous contents of that scratch word (i.e. a1 is corrupted);
;       sp is restored to its entry value.
WriteBuffer_Drain_ARMv3
        ;swap always forces unbuffered write, stalling till WB empty
        ; reserve the scratch word first, then swap a1 with it, then release it
        SUB     sp, sp, #4
        SWP     a1, a1, [sp]
        ADD     sp, sp, #4
        MOV     pc, lr

; TLB_Invalidate_ARMv3
; Issues the CP15 c5, c0 write (annotated elsewhere in the ARMv3 ops as
; "invalidate TLB") and returns. No register is modified.
TLB_Invalidate_ARMv3
        MCR     p15, 0, a1, c5, c0
        MOV     pc, lr

; TLB_InvalidateEntry_ARMv3
; Invalidates a single TLB entry (CP15 c6, c0 write of the page address).
;
; a1 = page entry to invalidate (page aligned address)
;
; No register is modified.
;
TLB_InvalidateEntry_ARMv3
        MCR     p15, 0, a1, c6, c0
        MOV     pc, lr

; MMU_Changing_ARMv3
; Global "mappings have changed" op: invalidates the whole cache, then the
; whole TLB. No register is modified.
; Also the tail-call target of MMU_ChangingEntries_ARMv3 for large counts.
MMU_Changing_ARMv3
        MCR     p15, 0, a1, c7, c0      ; invalidate cache
        MCR     p15, 0, a1, c5, c0      ; invalidate TLB
        MOV     pc, lr

; MMU_ChangingUncached_ARMv3
; As MMU_Changing_ARMv3 but without any cache operation: only the TLB is
; invalidated. No register is modified.
; Also the tail-call target of MMU_ChangingUncachedEntries_ARMv3.
MMU_ChangingUncached_ARMv3
        MCR     p15, 0, a1, c5, c0      ; invalidate TLB
        MOV     pc, lr

; MMU_ChangingEntry_ARMv3
; Single-page "mapping has changed" op. The cache operation used here is the
; whole-cache invalidate; only the TLB operation is per-entry.
;
; a1 = page affected (page aligned address)
;
; No register is modified.
;
MMU_ChangingEntry_ARMv3
        MCR     p15, 0, a1, c7, c0      ; invalidate cache
        MCR     p15, 0, a1, c6, c0      ; invalidate TLB entry
        MOV     pc, lr

; MMU_ChangingEntries_ARMv3
; Multi-page "mappings have changed" op.
;
; a1 = first page affected (page aligned address)
; a2 = number of pages
;
; For 16 or more pages this tail-branches to MMU_Changing_ARMv3 (global
; invalidate). Otherwise the cache is invalidated once and the TLB entries
; are invalidated one page at a time.
;
; Exit (per-page path): a1 has been advanced by a2*PageSize; a2 is preserved.
; NOTE(review): the loop decrements before testing, so a2 = 0 is not handled
; as "no pages" - presumably callers never pass 0; confirm.
;
MMU_ChangingEntries_ARMv3 ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_Changing_ARMv3
        Push    "a2"
        MCR     p15, 0, a1, c7, c0      ; invalidate cache
10
        MCR     p15, 0, a1, c6, c0      ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr

; MMU_ChangingUncachedEntry_ARMv3
; Single-page mapping change with no cache operation: only the TLB entry for
; the page is invalidated.
;
; a1 = page affected (page aligned address)
;
; No register is modified.
;
MMU_ChangingUncachedEntry_ARMv3
        MCR     p15, 0, a1, c6, c0      ; invalidate TLB entry
        MOV     pc, lr

; MMU_ChangingUncachedEntries_ARMv3
; Multi-page mapping change with no cache operation.
;
; a1 = first page affected (page aligned address)
; a2 = number of pages
;
; For 16 or more pages this tail-branches to MMU_ChangingUncached_ARMv3
; (whole-TLB invalidate); otherwise TLB entries are invalidated page by page.
;
; Exit (per-page path): a1 has been advanced by a2*PageSize; a2 is preserved.
; NOTE(review): a2 = 0 is not guarded (decrement happens before the test).
;
MMU_ChangingUncachedEntries_ARMv3 ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_ChangingUncached_ARMv3
        Push    "a2"
10
        MCR     p15, 0, a1, c6, c0      ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr

; Cache_RangeThreshold_ARMv3
; Returns the cache range threshold for ARMv3 parts.
; Exit: a1 = 16*PageSize (a fixed, admittedly arbitrary value).
Cache_RangeThreshold_ARMv3
        ; assembly-time diagnostic flagging that the value below is arbitrary
        ! 0, "arbitrary Cache_RangeThreshold_ARMv3"
        MOV     a1, #16*PageSize
        MOV     pc, lr

        ; dump any pending literal pool here, after the end of the ARMv3 ops
        LTORG

; --------------------------------------------------------------------------
; ----- generic ARMops for simple ARMs, ARMv4 onwards ----------------------
; --------------------------------------------------------------------------
;
; eg. ARM7TDMI based ARMs, unified, writethrough cache
;

; Cache_InvalidateUnified
; Invalidates the (unified) cache via a CP15 c7, c7 write of zero.
; Exit: a1 = 0.
Cache_InvalidateUnified
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7
        MOV     pc, lr

; WriteBuffer_Drain_OffOn
; Drains the write buffer by writing the control register once with the
; MMUC_W bit cleared and then again with its original value.
; Exit: a1 = control register value as read on entry; a2 is preserved.
WriteBuffer_Drain_OffOn
        ; used if ARM has no drain WBuffer MCR op
        Push    "a2"
        ARM_read_control a1
        ; a2 = control value with the W bit off; a1 keeps the original
        BIC     a2, a1, #MMUC_W
        ARM_write_control a2
        ARM_write_control a1
        Pull    "a2"
        MOV     pc, lr

; WriteBuffer_Drain
; Drains the write buffer using the dedicated CP15 c7, c10, 4 operation.
; Exit: a1 = 0.
WriteBuffer_Drain
        ; used if ARM has proper drain WBuffer MCR op
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4
        MOV     pc, lr

; TLB_Invalidate_Unified
; Invalidates the whole TLB via a CP15 c8, c7 write of zero.
; Exit: a1 = 0.
TLB_Invalidate_Unified
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7
        MOV     pc, lr

; TLB_InvalidateEntry_Unified
; Invalidates the single TLB entry for the given page (CP15 c8, c7, 1).
;
; a1 = page entry to invalidate (page aligned address)
;
; No register is modified.
;
TLB_InvalidateEntry_Unified
        MCR     p15, 0, a1, c8, c7, 1
        MOV     pc, lr

; MMU_Changing_Writethrough
; Global "mappings have changed" op for writethrough-cache ARMs: invalidates
; the whole cache, then the whole TLB.
; Exit: a1 = 0.
; Also the tail-call target of MMU_ChangingEntries_Writethrough.
MMU_Changing_Writethrough
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7      ; invalidate cache
        MCR     p15, 0, a1, c8, c7      ; invalidate TLB
        MOV     pc, lr

; MMU_ChangingUncached
; Global mapping change with no cache operation: invalidates the whole TLB.
; Exit: a1 = 0.
; Also the tail-call target of MMU_ChangingUncachedEntries.
MMU_ChangingUncached
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7      ; invalidate TLB
        MOV     pc, lr

; MMU_ChangingEntry_Writethrough
; Single-page mapping change: invalidates the whole cache and then the TLB
; entry for the page.
;
; a1 = page affected (page aligned address)
;
; a4 is borrowed to carry the zero for the cache invalidate so that a1 still
; holds the page address for the TLB entry operation; a4 is preserved.
; No register is modified on exit.
;
MMU_ChangingEntry_Writethrough
        Push    "a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7      ; invalidate cache
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        Pull    "a4"
        MOV     pc, lr

; MMU_ChangingEntries_Writethrough
; Multi-page mapping change for writethrough-cache ARMs.
;
; a1 = first page affected (page aligned address)
; a2 = number of pages
;
; For 16 or more pages this tail-branches to MMU_Changing_Writethrough.
; Otherwise the whole cache is invalidated once (a4 carries the zero) and the
; TLB entries are invalidated page by page.
;
; Exit (per-page path): a1 advanced by a2*PageSize; a2 and a4 preserved.
; NOTE(review): a2 = 0 is not guarded (decrement happens before the test).
;
MMU_ChangingEntries_Writethrough  ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_Changing_Writethrough
        Push    "a2,a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7      ; invalidate cache
10
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2,a4"
        MOV     pc, lr

; MMU_ChangingUncachedEntry
; Single-page mapping change with no cache operation: invalidates just the
; TLB entry for the page.
;
; a1 = page affected (page aligned address)
;
; No register is modified.
;
MMU_ChangingUncachedEntry
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        MOV     pc, lr

; MMU_ChangingUncachedEntries
; Multi-page mapping change with no cache operation.
;
; a1 = first page affected (page aligned address)
; a2 = number of pages
;
; For 16 or more pages this tail-branches to MMU_ChangingUncached (whole-TLB
; invalidate); otherwise TLB entries are invalidated page by page.
;
; Exit (per-page path): a1 advanced by a2*PageSize; a2 preserved.
; NOTE(review): a2 = 0 is not guarded (decrement happens before the test).
;
MMU_ChangingUncachedEntries ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_ChangingUncached
        Push    "a2"
10
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr

; Cache_RangeThreshold_Writethrough
; Returns the cache range threshold for writethrough-cache ARMs.
; Exit: a1 = 16*PageSize (a fixed, admittedly arbitrary value).
Cache_RangeThreshold_Writethrough
        ; assembly-time diagnostic flagging that the value below is arbitrary
        ! 0, "arbitrary Cache_RangeThreshold_Writethrough"
        MOV     a1, #16*PageSize
        MOV     pc, lr

; --------------------------------------------------------------------------
; ----- ARMops for ARM9 and the like ---------------------------------------
; --------------------------------------------------------------------------

; WB_CR7_LDa refers to ARMs with writeback data cache, cleaned with
; register 7, lockdown available (format A)
;
; Note that ARM920 etc have writeback/writethrough data cache selectable
; by MMU regions. For simplicity, we assume cacheable pages are mostly
; writeback. Any writethrough pages will have redundant clean operations
; applied when moved, for example, but this is a small overhead (cleaning
; a clean line is very quick on ARM 9).

Cache_CleanAll_WB_CR7_LDa ROUT
;
; only guarantees to clean lines not involved in interrupts (so we can
; clean without disabling interrupts)
;
; Clean cache by traversing all segment and index values
; As a concrete example, for ARM 920 (16k+16k caches) we would have:
;
;    DCache_LineLen       = 32         (32 byte cache line, segment field starts at bit 5)
;    DCache_IndexBit      = &04000000  (index field starts at bit 26)
;    DCache_IndexSegStart = &000000E0  (start at index=0, segment = 7)
;
; Register roles inside the loop:
;    a1 = DCache_LineLen   (step that moves to the next segment)
;    a2 = DCache_IndexBit  (step that moves to the next index)
;    ip = current segment/index value passed to the clean operation
;
; Exit: a1 = DCache_LineLen (corrupted); a2 and ip preserved.
; The write buffer is drained before returning.
;
        Push    "a2, ip"
        LDR     ip, =ZeroPage
        LDRB    a1, [ip, #DCache_LineLen]        ; segment field starts at this bit
        LDR     a2, [ip, #DCache_IndexBit]       ; index field starts at this bit
        LDR     ip, [ip, #DCache_IndexSegStart]  ; starting value, with index at min, seg at max
10
        MCR     p15, 0, ip, c7, c10, 2           ; clean DCache entry by segment/index
        ADDS    ip, ip, a2                       ; next index, counting up, CS if wrapped back to 0
        BCC     %BT10
        SUBS    ip, ip, a1                       ; next segment, counting down, CC if wrapped back to max
        BCS     %BT10                            ; if segment wrapped, then we've finished
        MOV     ip, #0
        MCR     p15, 0, ip, c7, c10, 4           ; drain WBuffer
        Pull    "a2, ip"
        MOV     pc, lr


Cache_CleanInvalidateAll_WB_CR7_LDa ROUT
;
; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache
;
; Walks every segment/index combination exactly as Cache_CleanAll_WB_CR7_LDa
; does (a1 = segment step, a2 = index step, ip = current value), then drains
; the write buffer and invalidates the whole ICache.
;
; Exit: a1 = DCache_LineLen (corrupted); a2 and ip preserved.
;
        Push    "a2, ip"
        LDR     ip, =ZeroPage
        LDRB    a1, [ip, #DCache_LineLen]        ; segment field starts at this bit
        LDR     a2, [ip, #DCache_IndexBit]       ; index field starts at this bit
        LDR     ip, [ip, #DCache_IndexSegStart]  ; starting value, with index at min, seg at max
10
        MCR     p15, 0, ip, c7, c14, 2           ; clean&invalidate DCache entry by segment/index
        ADDS    ip, ip, a2                       ; next index, counting up, CS if wrapped back to 0
        BCC     %BT10
        SUBS    ip, ip, a1                       ; next segment, counting down, CC if wrapped back to max
        BCS     %BT10                            ; if segment wrapped, then we've finished
        MOV     ip, #0
        MCR     p15, 0, ip, c7, c10, 4           ; drain WBuffer
        MCR     p15, 0, ip, c7, c5, 0            ; invalidate ICache
        Pull    "a2, ip"
        MOV     pc, lr


Cache_InvalidateAll_WB_CR7_LDa ROUT
;
; no clean, assume caller knows what's happening
;
; Any dirty data still in the DCache is discarded, not written back.
; Exit: a1 = 0.
;
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7, 0           ; invalidate ICache and DCache
        MOV     pc, lr


Cache_RangeThreshold_WB_CR7_LDa ROUT
;
; Returns the DCache_RangeThreshold word held in the ZeroPage workspace.
; Exit: a1 = threshold value.
;
        LDR     a1, =ZeroPage
        LDR     a1, [a1, #DCache_RangeThreshold]
        MOV     pc, lr


TLB_InvalidateAll_WB_CR7_LDa ROUT
MMU_ChangingUncached_WB_CR7_LDa
;
; Two entry names for one routine: invalidate both TLBs in their entirety.
; No cache operation is performed.
; Exit: a1 = 0.
;
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        MOV     pc, lr


; Two entry names for one routine: invalidate the ITLB and DTLB entries for a
; single page. No cache operation is performed and no register is modified.
;
; a1 = page affected (page aligned address)
;
TLB_InvalidateEntry_WB_CR7_LDa ROUT
MMU_ChangingUncachedEntry_WB_CR7_LDa
        MCR     p15, 0, a1, c8, c5, 1           ; invalidate ITLB entry
        MCR     p15, 0, a1, c8, c6, 1           ; invalidate DTLB entry
        MOV     pc, lr


WriteBuffer_Drain_WB_CR7_LDa ROUT
;
; Drains the write buffer.
; Exit: a1 = 0.
;
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4          ; drain WBuffer
        MOV     pc, lr


IMB_Full_WB_CR7_LDa ROUT
;
; do: clean DCache; drain WBuffer, invalidate ICache
;
; Exit: a1 = 0.
; Also the tail-call target of IMB_Range_WB_CR7_LDa for large ranges.
;
        Push    "lr"
        BL      Cache_CleanAll_WB_CR7_LDa       ; also drains Wbuffer
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c5, 0           ; invalidate ICache
        Pull    "pc"

; IMB_Range_WB_CR7_LDa
; Instruction memory barrier over an address range.
;
;  a1 = start address (inclusive, cache line aligned)
;  a2 = end address (exclusive, cache line aligned)
;
; If the range is 32k or longer, tail-branches to IMB_Full_WB_CR7_LDa.
; Otherwise each line in the range has its DCache entry cleaned and its
; ICache entry invalidated, and the write buffer is drained at the end.
;
; Exit (range path): a1 = 0; a2 unchanged (the SUB is undone by the ADD).
;
IMB_Range_WB_CR7_LDa ROUT
        ; a2 is temporarily the range length for the threshold test; the ADD
        ; restores it without disturbing the flags that BHS then uses
        SUB     a2, a2, a1
        CMP     a2, #32*1024                     ; arbitrary-ish range threshold
        ADD     a2, a2, a1
        BHS     IMB_Full_WB_CR7_LDa
        Push    "lr"
        ; lr = cache line length, used as the loop stride
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c10, 1           ; clean DCache entry by VA
        MCR     p15, 0, a1, c7, c5, 1            ; invalidate ICache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4           ; drain WBuffer
        Pull    "pc"

MMU_Changing_WB_CR7_LDa ROUT
;
; Global "mappings have changed" op: clean&invalidate the caches via
; Cache_CleanInvalidateAll_WB_CR7_LDa, then invalidate both TLBs.
; Exit: a1 = 0.
;
        Push    "lr"
        BL      Cache_CleanInvalidateAll_WB_CR7_LDa
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        Pull    "pc"

; MMU_ChangingEntry_WB_CR7_LDa
; Single-page mapping change: clean&invalidate every DCache line and
; invalidate every ICache line in the page, drain the write buffer, then
; invalidate the page's DTLB and ITLB entries.
;
; a1 = page affected (page aligned address)
;
; Exit: a1 = page address as on entry; a2 preserved.
;
MMU_ChangingEntry_WB_CR7_LDa ROUT
        Push    "a2, lr"
        ; a2 = end of page (exclusive); lr = cache line length (loop stride)
        ADD     a2, a1, #PageSize
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry
        MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MOV     lr, #0
        MCR     p15, 0, lr, c7, c10, 4          ; drain WBuffer
        ; step a1 back from end-of-page to the page address for the TLB ops
        SUB     a1, a1, #PageSize
        MCR     p15, 0, a1, c8, c6, 1           ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1           ; invalidate ITLB entry
        Pull    "a2, pc"

; MMU_ChangingEntries_WB_CR7_LDa
; Multi-page mapping change.
;
; a1 = first page affected (page aligned address)
; a2 = number of pages
;
; The page count is converted to a byte length and compared against
; DCache_RangeThreshold:
;   - below the threshold: per-line clean&invalidate of the DCache and
;     invalidate of the ICache over the range, drain the write buffer, then
;     per-page invalidate of the DTLB/ITLB entries;
;   - otherwise (label 30): global clean&invalidate and whole-TLB invalidate.
;
; Exit: a2, a3 preserved; a1 corrupted (end of range on the per-line path,
;       0 on the global path).
;
MMU_ChangingEntries_WB_CR7_LDa ROUT
        Push    "a2, a3, lr"
        MOV     a2, a2, LSL #Log2PageSize
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold]   ;check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                         ;clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen]
        ; lr keeps the start address so the TLB loop can restart from it
        MOV     lr, a1
10
        MCR     p15, 0, a1, c7, c14, 1             ; clean&invalidate DCache entry
        MCR     p15, 0, a1, c7, c5, 1              ; invalidate ICache entry
        ADD     a1, a1, a3
        CMP     a1, a2
        BLO     %BT10
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4             ; drain WBuffer
        MOV     a1, lr                             ; restore start address
20
        MCR     p15, 0, a1, c8, c6, 1              ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1              ; invalidate ITLB entry
        ADD     a1, a1, #PageSize
        CMP     a1, a2
        BLO     %BT20
        Pull    "a2, a3, pc"
;
30
        ; the return address was stacked on entry, so BL may overwrite lr here
        BL      Cache_CleanInvalidateAll_WB_CR7_LDa
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        Pull    "a2, a3, pc"

; MMU_ChangingUncachedEntries_WB_CR7_LDa
; Multi-page mapping change with no cache operation.
;
; a1 = first page affected (page aligned address)
; a2 = number of pages
;
; Fewer than 32 pages: invalidate the DTLB and ITLB entries page by page.
; 32 pages or more (label 20): invalidate both TLBs in their entirety.
;
; Exit: a2 preserved; a1 corrupted (advanced by a2*PageSize on the per-page
;       path, 0 on the global path).
; NOTE(review): a2 = 0 is not guarded (decrement happens before the test).
;
MMU_ChangingUncachedEntries_WB_CR7_LDa ROUT
        CMP     a2, #32                            ; arbitrary-ish threshold
        BHS     %FT20
        Push    "a2"
10
        MCR     p15, 0, a1, c8, c6, 1              ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1              ; invalidate ITLB entry
        ADD     a1, a1, #PageSize
        SUBS    a2, a2, #1
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr
;
20
        ; The value written for a global invalidate should be zero; a1 still
        ; holds the page address here, so clear it first, as every other
        ; WB_CR7_LDa global operation does (a1 may be corrupted by ARMops).
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        MOV     pc, lr


; --------------------------------------------------------------------------
; ----- ARMops for StrongARM and the like ----------------------------------
; --------------------------------------------------------------------------

; WB_Crd is Writeback data cache, clean by reading data from cleaner area

; Currently no support for mini data cache on some StrongARM variants. Mini
; cache is always writeback and must have cleaning support, so is very
; awkward to use for cacheable screen, say.

; Global cache cleaning requires address space for private cleaner areas (not accessed
; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency
; hit), which means that the cleaner data is not invalidated afterwards. This is fine for
; RISC OS - where the private area is not used for anything else, and any re-use of the
; cache under interrupts is safe (eg. a page being moved is *never* involved in any
; active interrupts).

; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum
; cleaning cost while guaranteeing proper clean even if previous clean data is present. If
; the clean routine is re-entered, an independent, double sized clean is initiated. This
; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating
; the routine at all. The overhead is small, since by far the most common cleaning will be
; non-re-entered. The upshot is that the cleaner address space available must be at least 4
; times the cache size:
;   1 : used alternately, on 1st, 3rd, ... non-re-entered cleans
;   2 : used alternately, on 2nd, 4th, ... non-re-entered cleans
;   3 : used only for first half of a re-entered clean
;   4 : used only for second half of a re-entered clean
;
;   DCache_CleanBaseAddress   : start address of total cleaner space
;   DCache_CleanNextAddress   : start address for next non-re-entered clean, or 0 if re-entered


Cache_CleanAll_WB_Crd ROUT
;
; - cleans data cache (and invalidates it as a side effect)
; - can be used with interrupts enabled (to avoid latency over time of clean)
; - can be re-entered
; - see remarks at top of StrongARM ops for discussion of strategy
;
; Register roles:
;   a1 = DCache_CleanBaseAddress (base of the cleaner space)
;   a2 = address of the DCache_CleanNextAddress word (the semaphore)
;   a3 = DCache_Size                a4 = DCache_LineLen (read stride)
;   v1 = current read address       v2 = end address of this clean
;
; Exit: a1 = DCache_CleanBaseAddress value (corrupted); a2-a4, v1, v2 preserved.
;

        Push    "a2-a4, v1, v2, lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #DCache_CleanBaseAddress]
        ; DCache_CleanNextAddress is a workspace offset like the other DCache_
        ; fields read relative to ZeroPage here, so the absolute address of the
        ; word needs ZeroPage added (as Cache_CleanAll_WB_Cal_LD does). Without
        ; it the SWP/STRLS below hit the wrong word whenever ZeroPage is non-zero.
        LDR     a2, =ZeroPage+DCache_CleanNextAddress
        LDR     a3, [lr, #DCache_Size]
        LDRB    a4, [lr, #DCache_LineLen]
        MOV     v2, #0
        SWP     v1, v2, [a2]                        ; read current CleanNextAddr, zero it (semaphore)
        TEQ     v1, #0                              ; but if it is already zero, we have re-entered
        ADDEQ   v1, a1, a3, LSL #1                  ; if re-entered, start clean at Base+2*Cache_Size
        ADDEQ   v2, v1, a3, LSL #1                  ; if re-entered, do a clean of 2*Cache_Size
        ADDNE   v2, v1, a3                          ; if not re-entered, do a clean of Cache_Size
10
        LDR     lr, [v1], a4                        ; read one line of cleaner data, step to next line
        TEQ     v1, v2
        BNE     %BT10
        ADD     v2, a1, a3, LSL #1                  ; compare end address with Base+2*Cache_Size
        CMP     v1, v2
        MOVEQ   v1, a1                              ; if equal, not re-entered and Next wraps back
        STRLS   v1, [a2]                            ; if lower or same, not re-entered, so update Next
        MCR     p15, 0, a1, c7, c10, 4              ; drain WBuffer
        Pull    "a2-a4, v1, v2, pc"


Cache_CleanInvalidateAll_WB_Crd ROUT
IMB_Full_WB_Crd
;
;does not truly invalidate DCache, but effectively invalidates (flushes) all lines not
;involved in interrupts - this is sufficient for OS requirements, and means we don't
;have to disable interrupts for possibly slow clean
;
;Two entry names for one routine; IMB_Full_WB_Crd is also the tail-call target
;of IMB_Range_WB_Crd for large ranges.
;Exit: a1 corrupted (left as returned by Cache_CleanAll_WB_Crd).
;
        Push    "lr"
        BL      Cache_CleanAll_WB_Crd               ;clean DCache (wrt to non-interrupt stuff)
        MCR     p15, 0, a1, c7, c5, 0               ;flush ICache
        Pull    "pc"

Cache_InvalidateAll_WB_Crd
;
; no clean, assume caller knows what is happening
;
; Flushes both caches and then drains the write buffer. The value written is
; a1 as passed in; no register is modified.
;
        MCR     p15, 0, a1, c7, c7, 0               ;flush ICache and DCache
        MCR     p15, 0, a1, c7, c10, 4              ;drain WBuffer
        MOV     pc, lr

; Cache_RangeThreshold_WB_Crd
; Returns the DCache_RangeThreshold word held in the ZeroPage workspace.
; Exit: a1 = threshold value.
Cache_RangeThreshold_WB_Crd
        LDR     a1, =ZeroPage
        LDR     a1, [a1, #DCache_RangeThreshold]
        MOV     pc, lr

; Two entry names for one routine: flush both TLBs in their entirety.
; No cache operation is performed and no register is modified.
TLB_InvalidateAll_WB_Crd
MMU_ChangingUncached_WB_Crd
        MCR     p15, 0, a1, c8, c7, 0              ;flush ITLB and DTLB
        MOV     pc, lr

; Two entry names for one routine: TLB maintenance for a single page.
; a1 = page address.
; Only the DTLB operation is per-entry; the ITLB is flushed as a whole.
; No register is modified.
TLB_InvalidateEntry_WB_Crd
MMU_ChangingUncachedEntry_WB_Crd
        MCR     p15, 0, a1, c8, c6, 1              ;flush DTLB entry
        MCR     p15, 0, a1, c8, c5, 0              ;flush ITLB
        MOV     pc, lr

; WriteBuffer_Drain_WB_Crd
; Drains the write buffer. The value written is a1 as passed in; no register
; is modified.
WriteBuffer_Drain_WB_Crd
        MCR     p15, 0, a1, c7, c10, 4             ;drain WBuffer
        MOV     pc, lr


IMB_Range_WB_Crd ROUT
;
; Instruction memory barrier over the range a1 (start) up to a2 (end,
; exclusive, as the loop stops once a1 reaches a2).
; If the range is 64k or longer, tail-branches to IMB_Full_WB_Crd.
; Otherwise each DCache line in the range is cleaned, the write buffer is
; drained and the whole ICache is flushed.
;
; Exit (range path): a1 = first line address at or beyond a2; a2 unchanged.
;
        ; a2 is temporarily the range length; ADD restores it without touching
        ; the flags that BHS then uses
        SUB     a2, a2, a1
        CMP     a2, #64*1024                       ;arbitrary-ish range threshold
        ADD     a2, a2, a1
        BHS     IMB_Full_WB_Crd
        Push    "lr"
        ; lr = cache line length, used as the loop stride
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c10, 1             ;clean DCache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c10, 4             ;drain WBuffer
        MCR     p15, 0, a1, c7, c5, 0              ;flush ICache
        Pull    "pc"

; MMU_Changing_WB_Crd
; Global "mappings have changed" op: clean the DCache via
; Cache_CleanAll_WB_Crd, flush the ICache, then flush both TLBs.
; Exit: a1 corrupted (left as returned by Cache_CleanAll_WB_Crd).
MMU_Changing_WB_Crd
        Push    "lr"
        BL      Cache_CleanAll_WB_Crd               ;clean DCache (wrt to non-interrupt stuff)
        MCR     p15, 0, a1, c7, c5, 0               ;flush ICache
        MCR     p15, 0, a1, c8, c7, 0               ;flush ITLB and DTLB
        Pull    "pc"

MMU_ChangingEntry_WB_Crd ROUT
;
;there is no clean&invalidate DCache instruction, however we can do clean
;entry followed by invalidate entry without an interrupt hole, because they
;are for the same virtual address (and that virtual address will not be
;involved in interrupts, since it is involved in remapping)
;
;Entry: a1 = address of the page affected (one PageSize span is processed)
;Exit:  a1 = page address as on entry; a2 preserved
;
        Push    "a2, lr"
        ; a2 = end of page (exclusive); lr = cache line length (loop stride)
        ADD     a2, a1, #PageSize
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]
10
        MCR     p15, 0, a1, c7, c10, 1             ;clean DCache entry
        MCR     p15, 0, a1, c7, c6, 1              ;flush DCache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        ; step a1 back from end-of-page to the page address for the TLB op
        SUB     a1, a1, #PageSize
        MCR     p15, 0, a1, c7, c10, 4             ;drain WBuffer
        MCR     p15, 0, a1, c7, c5, 0              ;flush ICache
        MCR     p15, 0, a1, c8, c6, 1              ;flush DTLB entry
        MCR     p15, 0, a1, c8, c5, 0              ;flush ITLB
        Pull    "a2, pc"

MMU_ChangingEntries_WB_Crd ROUT
;
;same comments as MMU_ChangingEntry_WB_Crd
;
;Entry: a1 = address of first page, a2 = number of pages (a2 is scaled by
;       Log2PageSize to give the byte length of the range)
;
;The byte length is compared against DCache_RangeThreshold:
;  - below the threshold: per-line clean + flush of the DCache over the
;    range, drain write buffer, flush whole ICache, per-page flush of DTLB
;    entries, then flush of the whole ITLB;
;  - otherwise (label 30): global DCache clean, ICache flush, both TLBs flushed.
;
;Exit:  a2, a3 preserved; a1 corrupted
;
        Push    "a2, a3, lr"
        MOV     a2, a2, LSL #Log2PageSize
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold]   ;check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                         ;clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen]
        ; lr keeps the start address so the TLB loop can restart from it
        MOV     lr, a1
10
        MCR     p15, 0, a1, c7, c10, 1             ;clean DCache entry
        MCR     p15, 0, a1, c7, c6, 1              ;flush DCache entry
        ADD     a1, a1, a3
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c10, 4             ;drain WBuffer
        MCR     p15, 0, a1, c7, c5, 0              ;flush ICache
        MOV     a1, lr                             ;restore start address
20
        MCR     p15, 0, a1, c8, c6, 1              ;flush DTLB entry
        ADD     a1, a1, #PageSize
        CMP     a1, a2
        BLO     %BT20
        MCR     p15, 0, a1, c8, c5, 0              ;flush ITLB
        Pull    "a2, a3, pc"
;
30
        ; the return address was stacked on entry, so BL may overwrite lr here
        BL      Cache_CleanAll_WB_Crd              ;clean DCache (wrt to non-interrupt stuff)
        MCR     p15, 0, a1, c7, c5, 0              ;flush ICache
        MCR     p15, 0, a1, c8, c7, 0              ;flush ITLB and DTLB
        Pull    "a2, a3, pc"

MMU_ChangingUncachedEntries_WB_Crd ROUT
;
;Multi-page mapping change with no cache operation.
;Entry: a1 = address of first page, a2 = number of pages
;
;Fewer than 32 pages: flush the DTLB entry for each page, then flush the
;whole ITLB. 32 pages or more (label 20): flush both TLBs in their entirety.
;
;Exit:  a2 unchanged (lr is used as the loop counter instead);
;       a1 advanced by a2*PageSize on the per-page path
;NOTE(review): a2 = 0 is not guarded (decrement happens before the test).
;
        CMP     a2, #32                            ;arbitrary-ish threshold
        BHS     %FT20
        Push    "lr"
        MOV     lr, a2
10
        MCR     p15, 0, a1, c8, c6, 1              ;flush DTLB entry
        ADD     a1, a1, #PageSize
        SUBS    lr, lr, #1
        BNE     %BT10
        MCR     p15, 0, a1, c8, c5, 0              ;flush ITLB
        Pull    "pc"
;
20
        MCR     p15, 0, a1, c8, c7, 0              ;flush ITLB and DTLB
        MOV     pc, lr


; ARMops for XScale, mjs Feb 2001
;
; WB_Cal_LD is writeback, clean with allocate, lockdown
;
; If the mini data cache is used (XScaleMiniCache true), it is assumed to be
; configured writethrough (eg. used for RISC OS screen memory). This saves an ugly/slow
; mini cache clean for things like IMB_Full.
;
; Sadly, for global cache invalidate with mini cache, things are awkward. We can't clean the
; main cache then do the global invalidate MCR, unless we tolerate having _all_ interrupts
; off (else the main cache may be slightly dirty from interrupts, and the invalidate
; will lose data). So we must reluctantly 'invalidate' the mini cache by the ugly/slow
; mechanism as if we were cleaning it :-( Intel should provide a separate global invalidate
; (and perhaps a line allocate) for the mini cache.
;
; We do not use lockdown.
;
; For simplicity, we assume cacheable pages are mostly writeback. Any writethrough
; pages will be invalidated as if they were writeback, but there is little overhead
; (cleaning a clean line or allocating a line from cleaner area are both fast).

; Global cache cleaning requires address space for private cleaner areas (not accessed
; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency
; hit), which means that the cleaner data is not invalidated afterwards. This is fine for
; RISC OS - where the private area is not used for anything else, and any re-use of the
; cache under interrupts is safe (eg. a page being moved is *never* involved in any
; active interrupts).

; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum
; cleaning cost while guaranteeing proper clean even if previous clean data is present. If
; the clean routine is re-entered, an independent, double sized clean is initiated. This
; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating
; the routine at all. The overhead is small, since by far the most common cleaning will be
; non-re-entered. The upshot is that the cleaner address space available must be at least 4
; times the cache size:
;   1 : used alternately, on 1st, 3rd, ... non-re-entered cleans
;   2 : used alternately, on 2nd, 4th, ... non-re-entered cleans
;   3 : used only for first half of a re-entered clean
;   4 : used only for second half of a re-entered clean
;
; If the mini cache is used, it has its own equivalent cleaner space and algorithm.
; Parameters for each cache are:
;
;    Cache_CleanBaseAddress   : start address of total cleaner space
;    Cache_CleanNextAddress   : start address for next non-re-entered clean, or 0 if re-entered


                 GBLL XScaleMiniCache  ; *must* be configured writethrough if used
XScaleMiniCache  SETL {FALSE}
; XScaleMiniCache gates assembly of Cache_MiniInvalidateAll_WB_Cal_LD and of
; the call to it from Cache_CleanInvalidateAll_WB_Cal_LD. It is {FALSE} here,
; so that code is left out of the build.


; MACRO to do Intel approved CPWAIT, to guarantee any previous MCR's have taken effect
; corrupts a1
;
; Expands to two instructions: a CP15 read into a1 followed by a move of a1
; onto itself that waits on the read. The third step of the sequence is left
; out deliberately - see the comment inside the macro - so every use must be
; followed by a return that loads pc.
;
        MACRO
        CPWAIT
        MRC      p15, 0, a1, c2, c0, 0               ; arbitrary read of CP15
        MOV      a1, a1                              ; wait for it
        ; SUB pc, pc, #4 omitted, because all ops have a pc load to return to caller
        MEND


Cache_CleanAll_WB_Cal_LD ROUT
;
; - cleans main cache (and invalidates as a side effect)
; - if mini cache is in use, will be writethrough so no clean required
; - can be used with interrupts enabled (to avoid latency over time of clean)
; - can be re-entered
; - see remarks at top of XScale ops for discussion of strategy
;
; Register roles:
;   a1 = DCache_CleanBaseAddress (base of the cleaner space)
;   a2 = address of the DCache_CleanNextAddress word (the semaphore)
;   a3 = DCache_Size                a4 = DCache_LineLen (stride)
;   v1 = current cleaner address    v2 = end address of this clean
;
; Exit: a1 = DCache_CleanBaseAddress value (corrupted); a2-a4, v1, v2 preserved.
;
        Push    "a2-a4, v1, v2, lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #DCache_CleanBaseAddress]
        LDR     a2, =ZeroPage+DCache_CleanNextAddress
        LDR     a3, [lr, #DCache_Size]
        LDRB    a4, [lr, #DCache_LineLen]
        MOV     v2, #0
        SWP     v1, v2, [a2]                        ; read current CleanNextAddr, zero it (semaphore)
        TEQ     v1, #0                              ; but if it is already zero, we have re-entered
        ADDEQ   v1, a1, a3, LSL #1                  ; if re-entered, start clean at Base+2*Cache_Size
        ADDEQ   v2, v1, a3, LSL #1                  ; if re-entered, do a clean of 2*Cache_Size
        ADDNE   v2, v1, a3                          ; if not re-entered, do a clean of Cache_Size
10
        MCR     p15, 0, v1, c7, c2, 5               ; allocate address from cleaner space
        ADD     v1, v1, a4
        TEQ     v1, v2
        BNE     %BT10
        ADD     v2, a1, a3, LSL #1                  ; compare end address with Base+2*Cache_Size
        CMP     v1, v2
        MOVEQ   v1, a1                              ; if equal, not re-entered and Next wraps back
        STRLS   v1, [a2]                            ; if lower or same, not re-entered, so update Next
        MCR     p15, 0, a1, c7, c10, 4              ; drain WBuffer (waits, so no need for CPWAIT)
        Pull    "a2-a4, v1, v2, pc"

  [ XScaleMiniCache

; Only assembled when XScaleMiniCache is {TRUE}.

Cache_MiniInvalidateAll_WB_Cal_LD ROUT
;
; similar to Cache_CleanAll_WB_Cal_LD, but must do direct reads (cannot use allocate address MCR), and
; 'cleans' to achieve invalidate as side effect (mini cache will be configured writethrough)
;
; Register roles mirror Cache_CleanAll_WB_Cal_LD, using the MCache_ workspace
; fields instead of the DCache_ ones:
;   a1 = cleaner base, a2 = address of the "next" word (semaphore),
;   a3 = cache size, a4 = line length, v1 = current address, v2 = end address
;
; Exit: a1 = MCache_CleanBaseAddress value (corrupted); a2-a4, v1, v2 preserved.
; NOTE(review): the "next" word is named MCache_CleanNextAddr here, whereas the
; main cache uses DCache_CleanNextAddress - confirm the symbol name matches the
; workspace definition, since this code is not assembled by default.
;
        Push    "a2-a4, v1, v2, lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #MCache_CleanBaseAddress]
        LDR     a2, =ZeroPage+MCache_CleanNextAddr
        LDR     a3, [lr, #MCache_Size]
        LDRB    a4, [lr, #MCache_LineLen]
        MOV     v2, #0
        SWP     v1, v2, [a2]                        ; read current CleanNextAddr, zero it (semaphore)
        TEQ     v1, #0                              ; but if it is already zero, we have re-entered
        ADDEQ   v1, a1, a3, LSL #1                  ; if re-entered, start clean at Base+2*Cache_Size
        ADDEQ   v2, v1, a3, LSL #1                  ; if re-entered, do a clean of 2*Cache_Size
        ADDNE   v2, v1, a3                          ; if not re-entered, do a clean of Cache_Size
10
        LDR     lr, [v1], a4                        ; read a line of cleaner data
        TEQ     v1, v2
        BNE     %BT10
        ADD     v2, a1, a3, LSL #1                  ; compare end address with Base+2*Size
        CMP     v1, v2
        MOVEQ   v1, a1                              ; if equal, not re-entered and Next wraps back
        STRLS   v1, [a2]                            ; if lower or same, not re-entered, so update Next
        ; note, no drain WBuffer, since we are really only invalidating a writethrough cache
        Pull    "a2-a4, v1, v2, pc"

  ] ; XScaleMiniCache


Cache_CleanInvalidateAll_WB_Cal_LD ROUT
;
; - cleans main cache (and invalidates wrt OS stuff as a side effect)
; - if mini cache in use (will be writethrough), 'cleans' in order to invalidate as side effect
;
; Finishes by invalidating the ICache and BTB, followed by CPWAIT.
; Exit: a1 corrupted (by the called routines and by CPWAIT).
;
        Push    "lr"
        BL      Cache_CleanAll_WB_Cal_LD
  [ XScaleMiniCache
        BL      Cache_MiniInvalidateAll_WB_Cal_LD
  ]
        MCR     p15, 0, a1, c7, c5, 0                ; invalidate ICache and BTB
        CPWAIT
        Pull    "pc"


Cache_InvalidateAll_WB_Cal_LD ROUT
;
; no clean, assume caller knows what's happening
;
; Any dirty data still in the DCache is discarded, not written back.
; Exit: a1 corrupted (by CPWAIT).
;
        MCR     p15, 0, a1, c7, c7, 0           ; invalidate DCache, (MiniCache), ICache and BTB
        CPWAIT
        MOV     pc, lr


Cache_RangeThreshold_WB_Cal_LD ROUT
;
; Returns the DCache_RangeThreshold word held in the ZeroPage workspace.
; Exit: a1 = threshold value.
;
        LDR     a1, =ZeroPage
        LDR     a1, [a1, #DCache_RangeThreshold]
        MOV     pc, lr


TLB_InvalidateAll_WB_Cal_LD ROUT
MMU_ChangingUncached_WB_Cal_LD
;
; Two entry names for one routine: invalidate both TLBs in their entirety,
; then CPWAIT. No cache operation is performed.
; Exit: a1 corrupted (by CPWAIT).
;
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        CPWAIT
        MOV     pc, lr


TLB_InvalidateEntry_WB_Cal_LD ROUT
MMU_ChangingUncachedEntry_WB_Cal_LD
;
; Two entry names for one routine: invalidate the ITLB and DTLB entries for
; the page whose address is in a1, then CPWAIT.
; Exit: a1 corrupted (by CPWAIT), so the page address is not preserved.
;
        MCR     p15, 0, a1, c8, c5, 1           ; invalidate ITLB entry
        MCR     p15, 0, a1, c8, c6, 1           ; invalidate DTLB entry
        CPWAIT
        MOV     pc, lr


WriteBuffer_Drain_WB_Cal_LD ROUT
;
; Drains the write buffer. The value written is a1 as passed in; no register
; is modified.
;
        MCR     p15, 0, a1, c7, c10, 4          ; drain WBuffer (waits, so no need for CPWAIT)
        MOV     pc, lr


; IMB_Full_WB_Cal_LD
; Full instruction memory barrier: clean the main DCache via
; Cache_CleanAll_WB_Cal_LD, invalidate the ICache and BTB, then CPWAIT.
; Exit: a1 corrupted.
; Also the tail-call target of IMB_Range_WB_Cal_LD for large ranges.
IMB_Full_WB_Cal_LD
        Push    "lr"
        BL      Cache_CleanAll_WB_Cal_LD             ; clean DCache (wrt to non-interrupt stuff)
        MCR     p15, 0, a1, c7, c5, 0                ; invalidate ICache and BTB
        CPWAIT
        Pull    "pc"


IMB_Range_WB_Cal_LD ROUT
;
; Instruction memory barrier over an address range.
;
; Entry: a1 = start address
;        a2 = end address (the loop stops once a1 >= a2)
; Exit:  a1 advanced to (or past) the end address, a2 preserved
;
; Ranges of 32K or more are handed to IMB_Full_WB_Cal_LD instead.
; In this variant DCache_LineLen is used directly as the byte step between lines.
;
        SUB     a2, a2, a1                       ; a2 = length of range
        CMP     a2, #32*1024                     ; arbitrary-ish range threshold
        ADD     a2, a2, a1                       ; restore end address (flags from CMP are kept)
        BHS     IMB_Full_WB_Cal_LD
        Push    "lr"
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]        ; lr = per-iteration address step
10
        MCR     p15, 0, a1, c7, c10, 1           ; clean DCache entry
 [ :LNOT:XScaleJTAGDebug
        MCR     p15, 0, a1, c7, c5, 1            ; invalidate ICache entry
 ]
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
 [ XScaleJTAGDebug
        ; JTAG debug builds skip the per-line ICache invalidate above and
        ; invalidate the whole ICache once here instead
        MCR     p15, 0, a1, c7, c5, 0            ; invalidate ICache and BTB
 |
        MCR     p15, 0, a1, c7, c5, 6            ; invalidate BTB
 ]
        MCR     p15, 0, a1, c7, c10, 4           ; drain WBuffer (waits, so no need for CPWAIT)
        Pull    "pc"


MMU_Changing_WB_Cal_LD ROUT
;
; Global maintenance around an MMU mapping change: clean the whole DCache,
; invalidate the whole ICache and BTB, then invalidate both TLBs.
;
        Push    "lr"
        BL      Cache_CleanAll_WB_Cal_LD
        MCR     p15, 0, a1, c7, c5, 0           ; invalidate ICache and BTB
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        CPWAIT
        Pull    "pc"

MMU_ChangingEntry_WB_Cal_LD ROUT
;
; Cache and TLB maintenance for one page whose mapping is changing.
;
; Entry: a1 = address of the page
; Exit:  a2 preserved; a1 is stepped back by PageSize after the line loop,
;        before being used for the TLB entry invalidates
;
;there is no clean&invalidate DCache instruction, however we can do clean
;entry followed by invalidate entry without an interrupt hole, because they
;are for the same virtual address (and that virtual address will not be
;involved in interrupts, since it is involved in remapping)
;
        Push    "a2, lr"
        ADD     a2, a1, #PageSize               ; a2 = end of page (exclusive)
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]       ; lr = per-iteration address step
10
        MCR     p15, 0, a1, c7, c10, 1          ; clean DCache entry
        MCR     p15, 0, a1, c7, c6, 1           ; invalidate DCache entry
 [ :LNOT:XScaleJTAGDebug
        MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry
 ]
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c10, 4          ; drain WBuffer
 [ XScaleJTAGDebug
        ; JTAG debug builds invalidate the whole ICache here instead of per line
        MCR     p15, 0, a1, c7, c5, 0           ; invalidate ICache and BTB
 |
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate BTB
 ]
        SUB     a1, a1, #PageSize               ; back to the page address for the TLB ops
        MCR     p15, 0, a1, c8, c6, 1           ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1           ; invalidate ITLB entry
        CPWAIT
        Pull    "a2, pc"


MMU_ChangingEntries_WB_Cal_LD ROUT
;
; Cache and TLB maintenance for a run of pages whose mappings are changing.
;
; Entry: a1 = address of first page
;        a2 = number of pages (converted to a byte count below)
; Exit:  a2, a3 preserved; a1 corrupted
;
; If the byte count reaches DCache_RangeThreshold, the whole cache and both
; TLBs are done globally (label 30) instead of line by line.
;
;same comments as MMU_ChangingEntry_WB_Cal_LD
;
        Push    "a2, a3, lr"
        MOV     a2, a2, LSL #Log2PageSize          ; page count -> byte count
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold]   ;check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                         ;clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen]          ; a3 = per-iteration address step
        MOV     lr, a1                             ; remember start address for the TLB loop
10
        MCR     p15, 0, a1, c7, c10, 1             ; clean DCache entry
        MCR     p15, 0, a1, c7, c6, 1              ; invalidate DCache entry
 [ :LNOT:XScaleJTAGDebug
        MCR     p15, 0, a1, c7, c5, 1              ; invalidate ICache entry
 ]
        ADD     a1, a1, a3
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c10, 4             ; drain WBuffer
 [ XScaleJTAGDebug
        ; JTAG debug builds invalidate the whole ICache here instead of per line
        MCR     p15, 0, a1, c7, c5, 0              ; invalidate ICache and BTB
 |
        MCR     p15, 0, a1, c7, c5, 6              ; invalidate BTB
 ]
        MOV     a1, lr                             ; restore start address
20
        ; one pair of TLB entry invalidates per page in the range
        MCR     p15, 0, a1, c8, c6, 1              ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1              ; invalidate ITLB entry
        ADD     a1, a1, #PageSize
        CMP     a1, a2
        BLO     %BT20
        CPWAIT
        Pull    "a2, a3, pc"
;
; Large range: global clean/invalidate plus full TLB invalidate
;
30
        BL      Cache_CleanInvalidateAll_WB_Cal_LD
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        CPWAIT
        Pull    "a2, a3, pc"

MMU_ChangingUncachedEntries_WB_Cal_LD ROUT
;
; Invalidate the TLB entries for a run of pages (no cache maintenance).
;
; Entry: a1 = address of first page
;        a2 = number of pages
; Exit:  a2 preserved; a1 advanced past the last page on the per-entry path,
;        unchanged on the other paths
;
; 32 pages or more: both TLBs are invalidated in full (label 20).
; Zero pages: returns immediately. Without this guard the SUBS/BNE countdown
; below would wrap from 0 to &FFFFFFFF and iterate ~2^32 times.
;
        CMP     a2, #32                            ; arbitrary-ish threshold
        BHS     %FT20
        TEQ     a2, #0
        MOVEQ   pc, lr                             ; nothing to invalidate
        Push    "lr"
        MOV     lr, a2                             ; lr = pages remaining (keeps a2 intact)
10
        MCR     p15, 0, a1, c8, c6, 1              ; invalidate DTLB entry
        MCR     p15, 0, a1, c8, c5, 1              ; invalidate ITLB entry
        SUBS    lr, lr, #1
        ADD     a1, a1, #PageSize                  ; ADD leaves the SUBS flags intact for BNE
        BNE     %BT10
        CPWAIT
        Pull    "pc"
;
20
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        CPWAIT
        MOV     pc, lr

 [ MEMM_Type = "VMSAv6" ; Need appropriate myIMB, etc. implementations if this is to be removed

; --------------------------------------------------------------------------
; ----- ARMops for Cortex-A8 and the like ----------------------------------
; --------------------------------------------------------------------------

; WB_CR7_Lx refers to ARMs with writeback data cache, cleaned with
; register 7, and (potentially) multiple cache levels
;
; DCache_LineLen = log2(line len)-2 for smallest data/unified cache line length
; ICache_LineLen = log2(line len)-2 for smallest instruction cache line length
; DCache_RangeThreshold = clean threshold for data cache
; Cache_Lx_Info = Cache level ID register
; Cache_Lx_DTable = Cache size identification register for all 8 data/unified caches
; Cache_Lx_ITable = Cache size identification register for all 8 instruction caches

Cache_CleanAll_WB_CR7_Lx ROUT
; Clean cache by traversing all sets and ways for all data caches
;
; Exit: a2-a4, v1-v5 preserved; a1 corrupted
;
; Register use inside the loops:
;   a1 = remaining cache-type fields from Cache_Lx_Info (3 bits per level)
;   a2 = current cache level, pre-shifted (steps by 2 per level)
;   a3 = set/way/level operand for the CP15 operation
;   v2 = (ways-1) left-justified, v3 = (sets-1) at its operand bit position
;   v4 = way increment, v5 = set decrement
;   lr = pointer into Cache_Lx_DTable (one word per level)
;
; NOTE(review): if a level reports ways-1 = 0, v2 and therefore v4 are zero, so
; the ADDS at label 30 never sets carry and BCC would loop forever - confirm
; that no supported cache is direct-mapped. The BIC-derived increments also
; appear to assume power-of-two way and set counts.
;
        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #Cache_Lx_Info]!
        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
        BIC     a1, a1, #&FF000000 ; Discard unification/coherency bits
        MOV     a2, #0 ; Current cache level
20
        TST     a1, #7 ; Get flags
        BEQ     %FT10 ; Cache clean complete
        LDR     a3, [lr], #4 ; Get size info
        AND     v1, a3, #&7 ; log2(Line size)-2
        BIC     a3, a3, #&F0000007 ; Clear flags & line size
        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
        ; Way number needs to be packed right up at the high end of the data word; shift it up
        CLZ     a4, v2
        MOV     v2, v2, LSL a4
        ; Set number needs to start at log2(Line size)+2
        MOV     v3, v3, LSL #4 ; Start at bit 4
        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
        ; Now calculate the offset numbers we will use to increment sets & ways
        ; (x BIC (x LSL 1) isolates the lowest bit of a contiguous run of ones)
        BIC     v4, v2, v2, LSL #1 ; Way increment
        BIC     v5, v3, v3, LSL #1 ; Set increment
        ; Now we can finally clean this cache!
        ORR     a3, a2, v3 ; Current way (0), set (max), and level
30
        MCR     p15, 0, a3, c7, c10, 2 ; Clean
        ADDS    a3, a3, v4 ; Increment way
        BCC     %BT30 ; Overflow will occur once ways are enumerated
        TST     a3, v3 ; Are set bits all zero?
        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
        BNE     %BT30
        ; This cache is now clean. Move on to the next level.
        ADD     a2, a2, #2
        MOVS    a1, a1, LSR #3
        BNE     %BT20
10
        myDSB   ,a1 ; Wait for cache cleaning to complete
        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"


Cache_CleanInvalidateAll_WB_CR7_Lx ROUT
;
; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache
;
; Exit: a2-a4, v1-v5 preserved; a1 = 0
;
; Register use inside the loops:
;   a1 = remaining cache-type fields from Cache_Lx_Info (3 bits per level)
;   a2 = current cache level, pre-shifted (steps by 2 per level)
;   a3 = set/way/level operand for the CP15 operation
;   v2 = (ways-1) left-justified, v3 = (sets-1) at its operand bit position
;   v4 = way increment, v5 = set decrement
;   lr = pointer into Cache_Lx_DTable (one word per level)
;
; NOTE(review): if a level reports ways-1 = 0, v2 and therefore v4 are zero, so
; the ADDS at label 30 never sets carry and BCC would loop forever - confirm
; that no supported cache is direct-mapped.
;
        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #Cache_Lx_Info]!
        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
        BIC     a1, a1, #&FF000000 ; Discard unification/coherency bits
        MOV     a2, #0 ; Current cache level
20
        TST     a1, #7 ; Get flags
        BEQ     %FT10 ; Cache clean complete
        LDR     a3, [lr], #4 ; Get size info
        AND     v1, a3, #&7 ; log2(Line size)-2
        BIC     a3, a3, #&F0000007 ; Clear flags & line size
        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
        ; Way number needs to be packed right up at the high end of the data word; shift it up
        CLZ     a4, v2
        MOV     v2, v2, LSL a4
        ; Set number needs to start at log2(Line size)+2
        MOV     v3, v3, LSL #4 ; Start at bit 4
        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
        ; Now calculate the offset numbers we will use to increment sets & ways
        ; (x BIC (x LSL 1) isolates the lowest bit of a contiguous run of ones)
        BIC     v4, v2, v2, LSL #1 ; Way increment
        BIC     v5, v3, v3, LSL #1 ; Set increment
        ; Now we can finally clean & invalidate this cache!
        ORR     a3, a2, v3 ; Current way (0), set (max), and level
30
        MCR     p15, 0, a3, c7, c14, 2 ; Clean & invalidate
        ADDS    a3, a3, v4 ; Increment way
        BCC     %BT30 ; Overflow will occur once ways are enumerated
        TST     a3, v3 ; Are set bits all zero?
        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
        BNE     %BT30
        ; This cache is now clean and invalidated. Move on to the next level.
        ADD     a2, a2, #2
        MOVS    a1, a1, LSR #3
        BNE     %BT20
10
        MOV     a1, #0                ; zero operand for the barrier macros and the MCRs below
        myDSB   ,a1,,y                ; Wait for cache clean to complete
        MCR     p15, 0, a1, c7, c5, 0 ; invalidate ICache
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"


Cache_InvalidateAll_WB_CR7_Lx ROUT
;
; Invalidate (without cleaning) every data/unified cache level by set/way,
; then invalidate the ICache and branch predictors.
; no clean, assume caller knows what's happening
;
; Exit: a2-a4, v1-v5 preserved; a1 = 0
;
; Register use inside the loops:
;   a1 = remaining cache-type fields from Cache_Lx_Info (3 bits per level)
;   a2 = current cache level, pre-shifted (steps by 2 per level)
;   a3 = set/way/level operand for the CP15 operation
;   v2 = (ways-1) left-justified, v3 = (sets-1) at its operand bit position
;   v4 = way increment, v5 = set decrement
;   lr = pointer into Cache_Lx_DTable (one word per level)
;
; NOTE(review): if a level reports ways-1 = 0, v2 and therefore v4 are zero, so
; the ADDS at label 30 never sets carry and BCC would loop forever - confirm
; that no supported cache is direct-mapped.
;
        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #Cache_Lx_Info]!
        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
        BIC     a1, a1, #&FF000000 ; Discard unification/coherency bits
        MOV     a2, #0 ; Current cache level
20
        TST     a1, #7 ; Get flags
        BEQ     %FT10 ; Cache invalidation complete
        LDR     a3, [lr], #4 ; Get size info
        AND     v1, a3, #&7 ; log2(Line size)-2
        BIC     a3, a3, #&F0000007 ; Clear flags & line size
        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
        ; Way number needs to be packed right up at the high end of the data word; shift it up
        CLZ     a4, v2
        MOV     v2, v2, LSL a4
        ; Set number needs to start at log2(Line size)+2
        MOV     v3, v3, LSL #4 ; Start at bit 4
        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
        ; Now calculate the offset numbers we will use to increment sets & ways
        ; (x BIC (x LSL 1) isolates the lowest bit of a contiguous run of ones)
        BIC     v4, v2, v2, LSL #1 ; Way increment
        BIC     v5, v3, v3, LSL #1 ; Set increment
        ; Now we can finally invalidate this cache!
        ORR     a3, a2, v3 ; Current way (0), set (max), and level
30
        MCR     p15, 0, a3, c7, c6, 2 ; Invalidate
        ADDS    a3, a3, v4 ; Increment way
        BCC     %BT30 ; Overflow will occur once ways are enumerated
        TST     a3, v3 ; Are set bits all zero?
        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
        BNE     %BT30
        ; This cache is now invalidated. Move on to the next level.
        ADD     a2, a2, #2
        MOVS    a1, a1, LSR #3
        BNE     %BT20
10
        MOV     a1, #0                ; zero operand for the barrier macros and the MCRs below
        myDSB   ,a1,,y                ; Wait for invalidation to complete
        MCR     p15, 0, a1, c7, c5, 0 ; invalidate ICache
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"


Cache_RangeThreshold_WB_CR7_Lx ROUT
;
; Exit: a1 = DCache_RangeThreshold word read from the ZeroPage workspace
;       (the clean threshold for the data cache).
;
        LDR     a1, =ZeroPage
        LDR     a1, [a1, #DCache_RangeThreshold]
        MOV     pc, lr


;
; Two entry points:
;   MMU_ChangingUncached_WB_CR7_Lx - issues DSB+ISB first, then falls through
;   TLB_InvalidateAll_WB_CR7_Lx    - invalidates both TLBs and the branch predictors
; Exit: a1 = 0
;
MMU_ChangingUncached_WB_CR7_Lx
        myDSB   ,a1    ; Ensure the page table write has actually completed
        myISB   ,a1,,y ; Also required
TLB_InvalidateAll_WB_CR7_Lx ROUT
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        MOV     pc, lr


; a1 = page affected (page aligned address)
;
; Two entry points:
;   MMU_ChangingUncachedEntry_WB_CR7_Lx - issues DSB+ISB first, then falls through
;   TLB_InvalidateEntry_WB_CR7_Lx       - invalidates the TLB entry for a1 and
;                                         the branch predictors
; a1 is handed to myDSB as its scratch register at the end, so it should not
; be relied on after return.
;
MMU_ChangingUncachedEntry_WB_CR7_Lx
      [ NoARMv7
        ; On NoARMv7 builds the barrier macros are given a scratch register;
        ; a2 is borrowed and restored so that a1 (the page address) survives.
        Push    "a2"
        myDSB   ,a2    ; Ensure the page table write has actually completed
        myISB   ,a2,,y ; Also required
        Pull    "a2"
      |
        myDSB
        myISB
      ]
TLB_InvalidateEntry_WB_CR7_Lx ROUT
        MCR     p15, 0, a1, c8, c7, 1 ; invalidate ITLB & DTLB entry
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1                   ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        MOV     pc, lr


WriteBuffer_Drain_WB_CR7_Lx ROUT
;
; Write buffer drain, implemented as DSB followed by ISB.
; a1 is handed to the barrier macros as a scratch register.
;
        myDSB   ,a1    ; DSB is the new name for write buffer draining
        myISB   ,a1,,y ; Also do ISB for extra paranoia
        MOV     pc, lr


IMB_Full_WB_CR7_Lx ROUT
;
; do: clean DCache; drain WBuffer, invalidate ICache/branch predictor
; Luckily, we only need to clean as far as the level of unification
;
; Exit: a2-a4, v1-v5 preserved; a1 = 0
;
; Register use inside the loops:
;   a1 = number of levels still to clean, minus one (from bits 29:27 of Cache_Lx_Info)
;   a2 = current cache level, pre-shifted (steps by 2 per level)
;   a3 = set/way/level operand for the CP15 operation
;   v2 = (ways-1) left-justified, v3 = (sets-1) at its operand bit position
;   v4 = way increment, v5 = set decrement
;   lr = pointer into Cache_Lx_DTable (one word per level)
;
; NOTE(review): if a level reports ways-1 = 0, v2 and therefore v4 are zero, so
; the ADDS at label 30 never sets carry and BCC would loop forever - confirm
; that no supported cache is direct-mapped.
;
        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #Cache_Lx_Info]!
        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
        MOV     a1, a1, LSR #27
        AND     a1, a1, #&7 ; Get level of unification
        MOV     a2, #0 ; Current cache level
        SUBS    a1, a1, #1
        BLT     %FT10 ; Cache clean complete
20
        LDR     a3, [lr], #4 ; Get size info
        AND     v1, a3, #&7 ; log2(Line size)-2
        BIC     a3, a3, #&F0000007 ; Clear flags & line size
        MOV     v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits
        MOV     v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits
        ; Way number needs to be packed right up at the high end of the data word; shift it up
        CLZ     a4, v2
        MOV     v2, v2, LSL a4
        ; Set number needs to start at log2(Line size)+2
        MOV     v3, v3, LSL #4 ; Start at bit 4
        MOV     v3, v3, LSL v1 ; Start at log2(Line size)+2
        ; Now calculate the offset numbers we will use to increment sets & ways
        ; (x BIC (x LSL 1) isolates the lowest bit of a contiguous run of ones)
        BIC     v4, v2, v2, LSL #1 ; Way increment
        BIC     v5, v3, v3, LSL #1 ; Set increment
        ; Now we can finally clean this cache!
        ORR     a3, a2, v3 ; Current way (0), set (max), and level
30
        MCR     p15, 0, a3, c7, c10, 2 ; Clean
        ADDS    a3, a3, v4 ; Increment way
        BCC     %BT30 ; Overflow will occur once ways are enumerated
        TST     a3, v3 ; Are set bits all zero?
        SUBNE   a3, a3, v5 ; No, so decrement set and loop around again
        BNE     %BT30
        ; This cache is now clean. Move on to the next level.
        ADD     a2, a2, #2
        SUBS    a1, a1, #1
        BGE     %BT20
10
        MOV     a1, #0                ; zero operand for the barrier macros and the MCRs below
        myDSB   ,a1,,y                ; Wait for clean to complete
        MCR     p15, 0, a1, c7, c5, 0 ; invalidate ICache
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1,,y                ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"

;  a1 = start address (inclusive, cache line aligned)
;  a2 = end address (exclusive, cache line aligned)
;
; Instruction memory barrier over a range: clean the DCache lines, then
; invalidate the ICache lines and the branch predictors.
; Ranges of 32K or more, and ranges where end <= start, are passed to
; IMB_Full_WB_CR7_Lx instead.
; Exit: a2, a3 preserved; a1 corrupted.
;
IMB_Range_WB_CR7_Lx ROUT
        SUB     a2, a2, a1 ; a2 = length of range
        CMP     a2, #32*1024 ; Maximum L1 cache size on Cortex-A8 is 32K, use that to guess what approach to take
        ADD     a2, a2, a1 ; restore end address (flags from CMP are kept)
        CMPLO   a1, a2 ; The routine below will fail if the end address wraps around, so just IMB_Full instead
        BHS     IMB_Full_WB_CR7_Lx
        Push    "a1,a3,lr"
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen] ; log2(line len)-2
        MOV     a3, #4
        MOV     lr, a3, LSL lr ; lr = 4 << (log2(line len)-2) = line length in bytes
10
        MCR     p15, 0, a1, c7, c11, 1           ; clean DCache entry by VA to PoU
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        myDSB   ,a1  ; Wait for clean to complete
        Pull    "a1" ; Get start address back
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ
        MOV     lr, a3, LSL lr ; a3 is still 4, so lr = ICache line length in bytes
10
        MCR     p15, 0, a1, c7, c5, 1            ; invalidate ICache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c5, 6 ; invalidate branch predictors
        myDSB   ,a1                   ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a3,pc"

MMU_Changing_WB_CR7_Lx ROUT
;
; Global maintenance around an MMU mapping change: barrier, clean&invalidate
; all caches, then invalidate both TLBs.
; Exit: a1 = 0
;
        Push    "lr"
        myDSB   ,a1    ; Ensure the page table write has actually completed
        myISB   ,a1,,y ; Also required
        BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB
        myDSB   ,a1,,y                ; Wait for TLB invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects are visible
        Pull    "pc"

; a1 = page affected (page aligned address)
;
; Cache and TLB maintenance for one page whose mapping is changing:
; clean&invalidate its DCache lines, invalidate its ICache lines, then
; invalidate its TLB entry and the branch predictors.
; Exit: a2 preserved.
; NOTE(review): the final myDSB is given a1 as its scratch register, so a1
; presumably does not survive on builds where the macro uses it - confirm
; before relying on a1 after return.
;
; Both line loops terminate on equality (BNE), so they rely on the line length
; dividing PageSize exactly and on a1 being page aligned as documented.
;
MMU_ChangingEntry_WB_CR7_Lx ROUT
        Push    "a2, lr"
        myDSB   ,lr ; Ensure the page table write has actually completed
        myISB   ,lr,,y ; Also required
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen] ; log2(line len)-2
        MOV     a2, #4
        MOV     lr, a2, LSL lr ; lr = DCache line length in bytes
        ADD     a2, a1, #PageSize ; a2 = end of page (exclusive)
10
        MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry to PoC
        ADD     a1, a1, lr
        CMP     a1, a2
        BNE     %BT10
        myDSB   ,lr ; Wait for clean to complete
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ
        MOV     a1, #4
        MOV     lr, a1, LSL lr ; lr = ICache line length in bytes
        SUB     a1, a2, #PageSize ; Get start address back
10
        MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry (by MVA, to PoU)
        ADD     a1, a1, lr
        CMP     a1, a2
        BNE     %BT10
        SUB     a1, a1, #PageSize               ; back to the page address for the TLB op
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate DTLB and ITLB
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1
        myISB   ,a1,,y
        Pull    "a2, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
; Cache and TLB maintenance for a run of pages whose mappings are changing.
; If the byte count reaches DCache_RangeThreshold, everything is done globally
; (label 30); otherwise DCache lines, ICache lines and TLB entries are
; processed individually.
; Exit: a2, a3 preserved; a1 corrupted.
;
; NOTE(review): the loops below terminate on equality (BNE), so a page count of
; zero would not stop at the end address - presumably callers never pass 0; confirm.
;
MMU_ChangingEntries_WB_CR7_Lx ROUT
        Push    "a2, a3, lr"
        myDSB   ,lr    ; Ensure the page table write has actually completed
        myISB   ,lr,,y ; Also required
        MOV     a2, a2, LSL #Log2PageSize          ; page count -> byte count
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold]   ;check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                         ;clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen] ; log2(line len)-2
        MOV     lr, #4
        MOV     a3, lr, LSL a3 ; a3 = DCache line length in bytes
        MOV     lr, a1 ; remember start address
10
        MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry to PoC
        ADD     a1, a1, a3
        CMP     a1, a2
        BNE     %BT10
        myDSB   ,a3 ; Wait for clean to complete
        LDR     a3, =ZeroPage
        LDRB    a3, [a3, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ
        MOV     a1, #4
        MOV     a3, a1, LSL a3 ; a3 = ICache line length in bytes
        MOV     a1, lr ; Get start address back
10
        MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry (by MVA, to PoU)
        ADD     a1, a1, a3
        CMP     a1, a2
        BNE     %BT10
20
        ; lr walks the range a page at a time, starting from the saved start address
        MCR     p15, 0, lr, c8, c7, 1              ; invalidate DTLB & ITLB entry
        ADD     lr, lr, #PageSize
        CMP     lr, a2
        BNE     %BT20
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1
        myISB   ,a1,,y
        Pull    "a2, a3, pc"
;
; Large range: global clean&invalidate plus full TLB invalidate
;
30
        BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        myDSB   ,a1,,y                ; Wait for TLB invalidation to complete
        myISB   ,a1,,y                ; Ensure that the effects are visible
        Pull    "a2, a3, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
; Invalidate the TLB entries for a run of pages (no cache maintenance), then
; invalidate the branch predictors.
; Exit: a2 preserved; a1 advanced past the last page on the per-entry path.
;
; 32 pages or more: both TLBs are invalidated in full instead.
; Zero pages: the per-entry loop is skipped. Without this guard the SUBS/BNE
; countdown would wrap from 0 to &FFFFFFFF and iterate ~2^32 times.
; The barriers at label 20 are still executed in that case.
;
MMU_ChangingUncachedEntries_WB_CR7_Lx ROUT
        Push    "a2,lr"
        myDSB   ,lr    ; Ensure the page table write has actually completed
        myISB   ,lr,,y ; Also required
        CMP     a2, #32                            ; arbitrary-ish threshold
        MCRHS   p15, 0, a1, c8, c7, 0              ; invalidate ITLB and DTLB
        BHS     %FT20
        TEQ     a2, #0
        BEQ     %FT20                              ; no pages: nothing to invalidate individually
10
        MCR     p15, 0, a1, c8, c7, 1              ; invalidate DTLB & ITLB entry
        ADD     a1, a1, #PageSize
        SUBS    a2, a2, #1
        BNE     %BT10
20
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,lr,,y
        myISB   ,lr,,y
        Pull    "a2,pc"

 ] ; MEMM_Type = "VMSAv6"

; --------------------------------------------------------------------------


;        IMPORT  Write0_Translated

ARM_PrintProcessorType
;
; Print the name of the detected processor followed by two newlines.
; Does nothing if the ProcessorType byte in the ZeroPage workspace is ARMunk.
; a1-a3 are used as working registers.
;
        LDR     a1, =ZeroPage
        LDRB    a1, [a1, #ProcessorType]
        TEQ     a1, #ARMunk
        MOVEQ   pc, lr                          ; unknown processor: print nothing

        Push    "lr"
        ADR     a2, PNameTable
        LDHA    a1, a2, a1, a3                  ; macro - presumably loads halfword number a1 of the table at a2 (a3 scratch)
        ADD     a1, a2, a1                      ; table entries are offsets from PNameTable
        BL      Write0_Translated
        SWI     XOS_NewLine
        SWI     XOS_NewLine
        Pull    "pc"

; Table of 16-bit offsets, relative to PNameTable itself, of the processor name
; strings. ARM_PrintProcessorType indexes it with the ProcessorType value, so the
; entry order has to follow the processor type numbering (defined elsewhere).
PNameTable
        DCW     PName_ARM600    - PNameTable
        DCW     PName_ARM610    - PNameTable
        DCW     PName_ARM700    - PNameTable
        DCW     PName_ARM710    - PNameTable
        DCW     PName_ARM710a   - PNameTable
        DCW     PName_SA110     - PNameTable      ; pre rev T
        DCW     PName_SA110     - PNameTable      ; rev T or later
        DCW     PName_ARM7500   - PNameTable
        DCW     PName_ARM7500FE - PNameTable
        DCW     PName_SA1100    - PNameTable
        DCW     PName_SA1110    - PNameTable
        DCW     PName_ARM720T   - PNameTable
        DCW     PName_ARM920T   - PNameTable
        DCW     PName_ARM922T   - PNameTable
        DCW     PName_X80200    - PNameTable
        DCW     PName_X80321    - PNameTable
        DCW     PName_Cortex_A8 - PNameTable
	DCW	PName_Cortex_A9 - PNameTable
        DCW     PName_ARM1176JZF_S - PNameTable

; Zero-terminated processor name strings referenced from PNameTable and passed
; to Write0_Translated. Each has the form "<short name>:<descriptive text>" -
; presumably a message token followed by its default text (confirm against
; Write0_Translated).
PName_ARM600
        =       "600:ARM 600 Processor",0
PName_ARM610
        =       "610:ARM 610 Processor",0
PName_ARM700
        =       "700:ARM 700 Processor",0
PName_ARM710
        =       "710:ARM 710 Processor",0
PName_ARM710a
        =       "710a:ARM 710a Processor",0
PName_SA110
        =       "SA110:SA-110 Processor",0
PName_ARM7500
        =       "7500:ARM 7500 Processor",0
PName_ARM7500FE
        =       "7500FE:ARM 7500FE Processor",0
PName_SA1100
        =       "SA1100:SA-1100 Processor",0
PName_SA1110
        =       "SA1110:SA-1110 Processor",0
PName_ARM720T
        =       "720T:ARM 720T Processor",0
PName_ARM920T
        =       "920T:ARM 920T Processor",0
PName_ARM922T
        =       "922T:ARM 922T Processor",0
PName_X80200
        =       "X80200:80200 Processor",0
PName_X80321
        =       "X80321:80321 Processor",0
PName_Cortex_A8
        =       "CortexA8:Cortex-A8 Processor",0
PName_Cortex_A9
	=	"CortexA9:Cortex-A9 Processor",0
PName_ARM1176JZF_S
        =       "ARM1176JZF_S:ARM1176JZF-S Processor",0
        ALIGN                                   ; realign after the byte strings


; Lookup tables from DA flags PCB (bits 14:12,5,4, packed down to 4:2,1,0)
; to XCB bits in page table descriptors.
;
; Bit assignments within the packed 5-bit table index:
;   bit 0    = not bufferable
;   bit 1    = not cacheable
;   bits 4:2 = cache policy field, of which XCB_P is the lowest bit

XCB_NB  *       1:SHL:0
XCB_NC  *       1:SHL:1
XCB_P   *       1:SHL:2

        ALIGN 32                                ; each table that follows is 8 rows x 4 bytes = 32 bytes

; WT read-allocate cache (eg ARM720T)
; One row per policy value (0-7), one byte per NC/NB combination. The policy
; setting makes no difference for this cache type - all rows are identical.
XCBTableWT                                      ; C+B        CNB   NCB         NCNB
        = L2_C+L2_B, L2_C, L2_B, 0              ;        Default
        = L2_C+L2_B, L2_C, L2_B, 0              ; WT,         X,  Non-merging, X
        = L2_C+L2_B, L2_C, L2_B, 0              ; WB/RA,      X,  Merging,     X
        = L2_C+L2_B, L2_C, L2_B, 0              ; WB/WA,      X,  X,           X
        = L2_C+L2_B, L2_C, L2_B, 0              ; Alt DCache, X,  X,           X
        = L2_C+L2_B, L2_C, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B, L2_C, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B, L2_C, L2_B, 0              ; X,          X,  X,           X

; SA-110 in Risc PC - WB only read-allocate cache, non-merging WB
; Rows = policy value (0-7); columns = C+B, CNB, NCB, NCNB.
; A writethrough request (row 1) is given L2_B only, i.e. not cached.
XCBTableSA110
        = L2_C+L2_B,    0, L2_B, 0              ;        Default
        =      L2_B,    0, L2_B, 0              ; WT,         X,  Non-merging, X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/RA,      X,  Merging,     X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/WA,      X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; Alt DCache, X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X

; ARMv5 WB/WT read-allocate cache, non-merging WB (eg ARM920T)
; Rows = policy value (0-7); columns = C+B, CNB, NCB, NCNB.
; A writethrough request (row 1) is given L2_C without L2_B.
XCBTableWBR
        = L2_C+L2_B,    0, L2_B, 0              ;        Default
        = L2_C     ,    0, L2_B, 0              ; WT,         X,  Non-merging, X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/RA,      X,  Merging,     X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/WA,      X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; Alt DCache, X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X

; SA-1110 - WB only read allocate cache, merging WB, mini D-cache
; Rows = policy value (0-7); columns = C+B, CNB, NCB, NCNB.
; Row 1 differs from the default in both the C+B and NCB columns; row 4
; (Alt DCache) uses L2_C without L2_B.
XCBTableSA1110
        = L2_C+L2_B,    0, L2_B, 0              ;        Default
        =      L2_B,    0,    0, 0              ; WT,         X,  Non-merging, X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/RA,      X,  Merging,     X
        = L2_C+L2_B,    0, L2_B, 0              ; WB/WA,      X,  X,           X
        = L2_C     ,    0, L2_B, 0              ; Alt DCache, X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X
        = L2_C+L2_B,    0, L2_B, 0              ; X,          X,  X,           X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
;          defaulting to read-allocate
; Rows = policy value (0-7); columns = C+B, CNB, NCB, NCNB.
; Entries may also include L2_X; the default and unassigned rows here do not.
XCBTableXScaleRA
        =      L2_C+L2_B,    0,      L2_B, 0    ;        Default
        =      L2_C     ,    0, L2_X+L2_B, 0    ; WT,         X,  Non-merging, X
        =      L2_C+L2_B,    0,      L2_B, 0    ; WB/RA,      X,  Merging,     X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; WB/WA,      X,  X,           X
        = L2_X+L2_C     ,    0,      L2_B, 0    ; Alt DCache, X,  X,           X
        =      L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X
        =      L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X
        =      L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
;          defaulting to write-allocate
; Rows = policy value (0-7); columns = C+B, CNB, NCB, NCNB.
; Same as the read-allocate table except that the default and unassigned rows
; carry L2_X in the C+B column, matching the WB/WA row.
XCBTableXScaleWA
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ;        Default
        =      L2_C     ,    0, L2_X+L2_B, 0    ; WT,         X,  Non-merging, X
        =      L2_C+L2_B,    0,      L2_B, 0    ; WB/RA,      X,  Merging,     X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; WB/WA,      X,  X,           X
        = L2_X+L2_C     ,    0,      L2_B, 0    ; Alt DCache, X,  X,           X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X
        = L2_X+L2_C+L2_B,    0,      L2_B, 0    ; X,          X,  X,           X

        END