; Copyright 2000 Pace Micro Technology plc ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; See the License for the specific language governing permissions and ; limitations under the License. ; ; GET Hdr:ListOpts ; GET Hdr:Macros ; GET Hdr:System ; $GetCPU ; $GetMEMM ; GET hdr.Options ; GET Hdr:PublicWS ; GET Hdr:KernelWS ; GET hdr.Copro15ops ; GET hdr.ARMops v7 RN 10 ; EXPORT Init_ARMarch ; EXPORT ARM_Analyse ; EXPORT ARM_PrintProcessorType ; AREA KernelCode,CODE,READONLY ; ARM keep changing their mind about ID field layout. ; Here's a summary, courtesy of the ARM ARM (v5): ; ; pre-ARM 7: xxxx0xxx ; ARM 7: xxxx7xxx where bit 23 indicates v4T/~v3 ; post-ARM 7: xxxanxxx where n<>0 or 7 and a = architecture (1=4,2=4T,3=5,4=5T) ; ; int Init_ARMarch(void) ; Returns architecture, as above in a1. Also EQ if ARMv3, NE if ARMv4 or later. ; Corrupts only ip, no RAM usage. Init_ARMarch ARM_read_ID ip ANDS a1, ip, #&0000F000 MOVEQ pc, lr ; ARM 3 or ARM 6 TEQ a1, #&00007000 BNE %FT20 TST ip, #&00800000 ; ARM 7 - check for Thumb MOVNE a1, #ARMv4T MOVEQ a1, #ARMv3 MOV pc, lr 20 ANDS a1, ip, #&000F0000 ; post-ARM 7 MOV a1, a1, LSR #16 MOV pc, lr ARM_Analyse MOV a2, lr BL Init_ARMarch MOV lr, a2 [ MEMM_Type = "VMSAv6" CMP a1, #ARMvF BEQ ARM_Analyse_Fancy ; New ARM; use the feature regs to perform all the setup ] Push "v1,v2,v5,v6,v7,lr" ARM_read_ID v1 ARM_read_cachetype v2 LDR v6, =ZeroPage ADRL v7, KnownCPUTable FindARMloop LDMIA v7!, {a1, a2} ; See if it's a known ARM CMP a1, #-1 BEQ %FT20 AND a2, v1, a2 TEQ a1, a2 ADDNE v7, v7, #8 BNE FindARMloop TEQ v2, v1 ; If we don't have cache attributes, read from table LDREQ v2, [v7] 20 TEQ v2, v1 BEQ %BT20 ; Cache unknown: panic CMP a1, #-1 LDRNEB a2, [v7, #4] MOVEQ a2, #ARMunk STRB a2, [v6, #ProcessorType] ASSERT CT_Isize_pos = 0 MOV a1, v2 ADD a2, v6, #ICache_Info BL EvaluateCache MOV a1, v2, LSR #CT_Dsize_pos ADD a2, v6, #DCache_Info BL EvaluateCache AND a1, v2, #CT_ctype_mask MOV a1, a1, LSR #CT_ctype_pos STRB a1, [v6, #Cache_Type] [ No26bitCode MOV v5, #CPUFlag_32bitOS | MOV v5, #0 ] [ HiProcVecs ORR v5, v5, #CPUFlag_HiProcVecs ] TST v2, #CT_S ORRNE v5, v5, #CPUFlag_SplitCache+CPUFlag_SynchroniseCodeAreas [ CacheOff ORR v5, v5, #CPUFlag_SynchroniseCodeAreas | ARM_read_control a1 ; if Z bit set then we have branch prediction, TST a1, #MMUC_Z ; so we need OS_SynchroniseCodeAreas even if not ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas ; split caches ] ; Test abort timing (base restored or base updated) MOV a1, #&8000 LDR a2, [a1], #4 ; Will abort - DAb handler will continue execution TEQ a1, #&8000 ORREQ v5, v5, #CPUFlag_BaseRestored ; Check store of PC 30 STR pc, [sp, #-4]! ADR a2, %BT30 + 8 LDR a1, [sp], #4 TEQ a1, a2 ORREQ v5, v5, #CPUFlag_StorePCplus8 [ 0=1 ; Check whether 26-bit mode is available MSR CPSR_c, #F32_bit+I32_bit+SVC26_mode MRS a1, CPSR AND a1, a1, #M32_bits TEQ a1, #SVC26_mode ORRNE v5, v5, #CPUFlag_No26bitMode MSREQ CPSR_c, #F32_bit+I32_bit+SVC32_mode BNE %FT35 ; Do we get vector exceptions on read? 
LDR a2, =ZeroPage MOV a1, a2 LDR a1, [a1] ; If this aborts a1 will be left unchanged TEQ a1, a2 ORREQ v5, v5, #CPUFlag_VectorReadException ] 35 BL Init_ARMarch STRB a1, [v6, #ProcessorArch] TEQ a1, #ARMv3 ; assume long multiply available ORRNE v5, v5, #CPUFlag_LongMul ; if v4 or later TEQNE a1, #ARMv4 ; assume 26-bit available ORRNE v5, v5, #CPUFlag_No26bitMode ; iff v3 or v4 (not T) TEQNE a1, #ARMv5 ; assume Thumb available ORRNE v5, v5, #CPUFlag_Thumb ; iff not v3,v4,v5 MSR CPSR_f, #Q32_bit MRS lr, CPSR TST lr, #Q32_bit ORRNE v5, v5, #CPUFlag_DSP LDRB v4, [v6, #ProcessorType] TEQ v4, #ARMunk ; Modify deduced flags ADRNEL lr, KnownCPUFlags ADDNE lr, lr, v4, LSL #3 LDMNEIA lr, {a2, a3} ORRNE v5, v5, a2 BICNE v5, v5, a3 [ XScaleJTAGDebug TST v5, #CPUFlag_XScale BEQ %FT40 MRC p14, 0, a2, c10, c0 ; Read debug control register TST a2, #&80000000 ORRNE v5, v5, #CPUFlag_XScaleJTAGconnected MOVEQ a2, #&C000001C ; enable hot debug MCREQ p14, 0, a2, c10, c0 BNE %FT40 40 ] STR v5, [v6, #ProcessorFlags] ; Now, a1 = processor architecture (ARMv3, ARMv4 ...) ; v4 = processor type (ARM600, ARM610, ...) ; v5 = processor flags LDRB a2, [v6, #Cache_Type] [ MEMM_Type = "ARM600" CMP a1, #ARMv4 BLO Analyse_ARMv3 ; eg. ARM710 TEQ a2, #CT_ctype_WT TSTEQ v5, #CPUFlag_SplitCache BEQ Analyse_WriteThroughUnified ; eg. ARM7TDMI derivative TEQ a2, #CT_ctype_WB_Crd BEQ Analyse_WB_Crd ; eg. StrongARM TEQ a2, #CT_ctype_WB_Cal_LD BEQ Analyse_WB_Cal_LD ; assume XScale ] ; MEMM_Type = "ARM600" TEQ a2, #CT_ctype_WB_CR7_LDa BEQ Analyse_WB_CR7_LDa ; eg. ARM9 ; others ... WeirdARMPanic B WeirdARMPanic ; stiff :) [ MEMM_Type = "ARM600" Analyse_ARMv3 ADRL a1, NullOp ADRL a2, Cache_Invalidate_ARMv3 ADRL a3, WriteBuffer_Drain_ARMv3 ADRL a4, TLB_Invalidate_ARMv3 ADRL ip, TLB_InvalidateEntry_ARMv3 STR a1, [v6, #Proc_Cache_CleanAll] STR a2, [v6, #Proc_Cache_CleanInvalidateAll] STR a2, [v6, #Proc_Cache_InvalidateAll] STR a3, [v6, #Proc_WriteBuffer_Drain] STR a4, [v6, #Proc_TLB_InvalidateAll] STR ip, [v6, #Proc_TLB_InvalidateEntry] STR a1, [v6, #Proc_IMB_Full] STR a1, [v6, #Proc_IMB_Range] STR a1, [v6, #Proc_IMB_List] ADRL a1, MMU_Changing_ARMv3 ADRL a2, MMU_ChangingEntry_ARMv3 ADRL a3, MMU_ChangingUncached_ARMv3 ADRL a4, MMU_ChangingUncachedEntry_ARMv3 STR a1, [v6, #Proc_MMU_Changing] STR a2, [v6, #Proc_MMU_ChangingEntry] STR a3, [v6, #Proc_MMU_ChangingUncached] STR a4, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_ARMv3 ADRL a2, MMU_ChangingUncachedEntries_ARMv3 ADRL a3, Cache_RangeThreshold_ARMv3 ADRL a4, Cache_Examine_Simple STR a1, [v6, #Proc_MMU_ChangingEntries] STR a2, [v6, #Proc_MMU_ChangingUncachedEntries] STR a3, [v6, #Proc_Cache_RangeThreshold] STR a4, [v6, #Proc_Cache_Examine] ADRL a1, XCBTableWT STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WriteThroughUnified ADRL a1, NullOp ADRL a2, Cache_InvalidateUnified TST v5, #CPUFlag_NoWBDrain ADRNEL a3, WriteBuffer_Drain_OffOn ADREQL a3, WriteBuffer_Drain ADRL a4, TLB_Invalidate_Unified ADRL ip, TLB_InvalidateEntry_Unified STR a1, [v6, #Proc_Cache_CleanAll] STR a2, [v6, #Proc_Cache_CleanInvalidateAll] STR a2, [v6, #Proc_Cache_InvalidateAll] STR a3, [v6, #Proc_WriteBuffer_Drain] STR a4, [v6, #Proc_TLB_InvalidateAll] STR ip, [v6, #Proc_TLB_InvalidateEntry] STR a1, [v6, #Proc_IMB_Full] STR a1, [v6, #Proc_IMB_Range] STR a1, [v6, #Proc_IMB_List] ADRL a1, MMU_Changing_Writethrough ADRL a2, MMU_ChangingEntry_Writethrough ADRL a3, MMU_ChangingUncached ADRL a4, MMU_ChangingUncachedEntry STR a1, [v6, #Proc_MMU_Changing] STR a2, [v6, #Proc_MMU_ChangingEntry] STR a3, [v6, 
#Proc_MMU_ChangingUncached] STR a4, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_Writethrough ADRL a2, MMU_ChangingUncachedEntries ADRL a3, Cache_RangeThreshold_Writethrough ADRL a4, Cache_Examine_Simple STR a1, [v6, #Proc_MMU_ChangingEntries] STR a2, [v6, #Proc_MMU_ChangingUncachedEntries] STR a3, [v6, #Proc_Cache_RangeThreshold] STR a4, [v6, #Proc_Cache_Examine] ADRL a1, XCBTableWT STR a1, [v6, #MMU_PCBTrans] B %FT90 ] ; MEMM_Type = "ARM600" Analyse_WB_CR7_LDa TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard caches here (eg. ARM920) ADRL a1, Cache_CleanInvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_CR7_LDa STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, Cache_Examine_Simple STR a1, [v6, #Proc_Cache_Examine] ADRL a1, TLB_InvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_CR7_LDa STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_CR7_LDa STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_CR7_LDa STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_CR7_LDa STR a1, [v6, #Proc_IMB_Range] ADRL a1, IMB_List_WB_CR7_LDa STR a1, [v6, #Proc_IMB_List] ADRL a1, MMU_Changing_WB_CR7_LDa STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] LDRB a2, [v6, #DCache_Associativity] MOV a3, #256 MOV a4, #8 ; to find log2(ASSOC), rounded up Analyse_WB_CR7_LDa_L1 MOV a3, a3, LSR #1 SUB a4, a4, #1 CMP a2, a3 BLO Analyse_WB_CR7_LDa_L1 ADDHI a4, a4, #1 RSB a2, a4, #32 MOV a3, #1 MOV a3, a3, LSL a2 STR a3, [v6, #DCache_IndexBit] LDR a4, [v6, #DCache_NSets] LDRB a2, [v6, #DCache_LineLen] SUB a4, a4, #1 MUL a4, a2, a4 STR a4, [v6, #DCache_IndexSegStart] MOV a2, #64*1024 ; arbitrary-ish STR a2, [v6, #DCache_RangeThreshold] [ MEMM_Type = "ARM600" ADRL a1, XCBTableWBR ; assume read-allocate WB/WT cache | ADRL a1, XCBTableVMSAv6 ] STR a1, [v6, #MMU_PCBTrans] B %FT90 [ MEMM_Type = "ARM600" Analyse_WB_Crd TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard ADRL a1, Cache_CleanInvalidateAll_WB_Crd STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_Crd STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_Crd STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_Crd STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, Cache_Examine_Simple STR a1, [v6, #Proc_Cache_Examine] ADRL a1, TLB_InvalidateAll_WB_Crd STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_Crd STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_Crd STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_Crd STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_Crd STR a1, [v6, #Proc_IMB_Range] ADRL a1, IMB_List_WB_Crd STR a1, [v6, #Proc_IMB_List] ADRL a1, MMU_Changing_WB_Crd STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_Crd STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_Crd STR a1, 
[v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_Crd STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] LDR a2, =DCacheCleanAddress STR a2, [v6, #DCache_CleanBaseAddress] STR a2, [v6, #DCache_CleanNextAddress] MOV a2, #64*1024 ;arbitrary-ish threshold STR a2, [v6, #DCache_RangeThreshold] LDRB a2, [v6, #ProcessorType] TEQ a2, #SA110 ADREQL a2, XCBTableSA110 BEQ Analyse_WB_Crd_finish TEQ a2, #SA1100 TEQNE a2, #SA1110 ADREQL a2, XCBTableSA1110 ADRNEL a2, XCBTableWBR Analyse_WB_Crd_finish STR a2, [v6, #MMU_PCBTrans] B %FT90 Analyse_WB_Cal_LD TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard ADRL a1, Cache_CleanInvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_Cal_LD STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, Cache_Examine_Simple STR a1, [v6, #Proc_Cache_Examine] ADRL a1, TLB_InvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_Cal_LD STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_Cal_LD STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_Cal_LD STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_Cal_LD STR a1, [v6, #Proc_IMB_Range] ADRL a1, IMB_List_WB_Cal_LD STR a1, [v6, #Proc_IMB_List] ADRL a1, MMU_Changing_WB_Cal_LD STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] LDR a2, =DCacheCleanAddress STR a2, [v6, #DCache_CleanBaseAddress] STR a2, [v6, #DCache_CleanNextAddress] [ XScaleMiniCache ! 1, "You need to arrange for XScale mini-cache clean area to be mini-cacheable" LDR a2, =DCacheCleanAddress + 4 * 32*1024 STR a2, [v6, #MCache_CleanBaseAddress] STR a2, [v6, #MCache_CleanNextAddress] ] ; arbitrary-ish values, mini cache makes global op significantly more expensive [ XScaleMiniCache MOV a2, #128*1024 | MOV a2, #32*1024 ] STR a2, [v6, #DCache_RangeThreshold] ; enable full coprocessor access LDR a2, =&3FFF MCR p15, 0, a2, c15, c1 LDR a2, [v6, #ProcessorFlags] TST a2, #CPUFlag_ExtendedPages ADREQL a2, XCBTableXScaleNoExt ADRNEL a2, XCBTableXScaleWA ; choose between RA and WA here STR a2, [v6, #MMU_PCBTrans] B %FT90 ] ; MEMM_Type = "ARM600" [ MEMM_Type = "VMSAv6" Analyse_WB_CR7_Lx TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard caches here ; Read smallest instruction & data/unified cache line length MRC p15, 0, a1, c0, c0, 1 ; Cache type register MOV v2, a1, LSR #16 AND a4, a1, #&F AND v2, v2, #&F STRB a4, [v6, #ICache_LineLen] ; Store log2(line size)-2 STRB v2, [v6, #DCache_LineLen] ; log2(line size)-2 ; Read the cache info into Cache_Lx_* MRC p15, 1, a1, c0, c0, 1 ; Cache level ID register MOV v2, v6 ; Work around DTable/ITable alignment issues STR a1, [v2, #Cache_Lx_Info]! ADD a2, v2, #Cache_Lx_DTable-Cache_Lx_Info MOV a3, #0 10 ANDS v1, a1, #6 ; Data or unified cache at this level? 
MCRNE p15, 2, a3, c0, c0, 0 ; Program cache size selection register myISB ,v1 MRCNE p15, 1, v1, c0, c0, 0 ; Get size info (data/unified) STR v1, [a2] ADD a3, a3, #1 ANDS v1, a1, #1 ; Instruction cache at this level? MCRNE p15, 2, a3, c0, c0, 0 ; Program cache size selection register myISB ,v1 MRCNE p15, 1, v1, c0, c0, 0 ; Get size info (instruction) STR v1, [a2, #Cache_Lx_ITable-Cache_Lx_DTable] ; Shift the cache level ID register along to get the type of the next ; cache level ; However, we need to stop once we reach the first blank entry, because ; ARM have been sneaky and started to reuse some of the bits from the ; high end of the register (the Cortex-A8 TRM lists bits 21-23 as being ; for cache level 8, but the ARMv7 ARM lists them as being for the level ; of unification for inner shareable memory). The ARMv7 ARM does warn ; about making sure you stop once you find the first blank entry, but ; it doesn't say why! TST a1, #7 ADD a3, a3, #1 MOVNE a1, a1, LSR #3 CMP a3, #14 ; Stop after level 7 (even though an 8th level might exist on some CPUs?) ADD a2, a2, #4 BLT %BT10 ; Calculate DCache_RangeThreshold MOV a1, #128*1024 ; Arbitrary-ish STR a1, [v6, #DCache_RangeThreshold] ADRL a1, Cache_CleanInvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_CR7_Lx STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, Cache_Examine_WB_CR7_Lx STR a1, [v6, #Proc_Cache_Examine] ADRL a1, TLB_InvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_CR7_Lx STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_CR7_Lx STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_CR7_Lx STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_CR7_Lx STR a1, [v6, #Proc_IMB_Range] ADRL a1, IMB_List_WB_CR7_Lx STR a1, [v6, #Proc_IMB_List] ADRL a1, MMU_Changing_WB_CR7_Lx STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] ADRL a1, XCBTableVMSAv6 STR a1, [v6, #MMU_PCBTrans] B %FT90 ] ; MEMM_Type = "VMSAv6" 90 Pull "v1,v2,v5,v6,v7,pc" ; This routine works out the values LINELEN, ASSOCIATIVITY, NSETS and CACHE_SIZE defined ; in section B2.3.3 of the ARMv5 ARM. 
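;
; For illustration only (a worked example, not taken from the ARM ARM), using the
; ARM920T entry in KnownCPUTable (16K, 64-way, 8 words per line), the code below
; computes:
;   LINELEN       = 1 << (len + 3)                 -> len = 2   gives 32 bytes
;   Multiplier    = 2 + M                          -> M = 0     gives 2
;   ASSOCIATIVITY = Multiplier << (assoc - 1)      -> assoc = 6 gives 64
;   CACHE_SIZE    = Multiplier << (size + 8)       -> size = 5  gives 16384
;   NSETS         = 1 << (size + 6 - assoc - len)  -> 1 << 3    gives 8
; Cross-check: LINELEN * ASSOCIATIVITY * NSETS = 32 * 64 * 8 = 16384 = CACHE_SIZE.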
EvaluateCache AND a3, a1, #CT_assoc_mask+CT_M TEQ a3, #(CT_assoc_0:SHL:CT_assoc_pos)+CT_M BEQ %FT80 MOV ip, #1 ASSERT CT_len_pos = 0 AND a4, a1, #CT_len_mask ADD a4, a4, #3 MOV a4, ip, LSL a4 ; LineLen = 1 << (len+3) STRB a4, [a2, #ICache_LineLen-ICache_Info] MOV a3, #2 TST a1, #CT_M ADDNE a3, a3, #1 ; Multiplier = 2 + M AND a4, a1, #CT_assoc_mask RSB a4, ip, a4, LSR #CT_assoc_pos MOV a4, a3, LSL a4 ; Associativity = Multiplier << (assoc-1) STRB a4, [a2, #ICache_Associativity-ICache_Info] AND a4, a1, #CT_size_mask MOV a4, a4, LSR #CT_size_pos MOV a3, a3, LSL a4 MOV a3, a3, LSL #8 ; Size = Multiplier << (size+8) STR a3, [a2, #ICache_Size-ICache_Info] ADD a4, a4, #6 AND a3, a1, #CT_assoc_mask SUB a4, a4, a3, LSR #CT_assoc_pos AND a3, a1, #CT_len_mask ASSERT CT_len_pos = 0 SUB a4, a4, a3 MOV a4, ip, LSL a4 ; NSets = 1 << (size + 6 - assoc - len) STR a4, [a2, #ICache_NSets-ICache_Info] MOV pc, lr 80 MOV a1, #0 STR a1, [a2, #ICache_NSets-ICache_Info] STR a1, [a2, #ICache_Size-ICache_Info] STRB a1, [a2, #ICache_LineLen-ICache_Info] STRB a1, [a2, #ICache_Associativity-ICache_Info] MOV pc, lr ; Create a list of CPUs, 16 bytes per entry: ; ID bits (1 word) ; Test mask for ID (1 word) ; Cache type register value (1 word) ; Processor type (1 byte) ; Architecture type (1 byte) ; Reserved (2 bytes) GBLA tempcpu MACRO CPUDesc $proc, $id, $mask, $arch, $type, $s, $dsz, $das, $dln, $isz, $ias, $iln LCLA type type SETA (CT_ctype_$type:SHL:CT_ctype_pos)+($s:SHL:CT_S_pos) tempcpu CSzDesc $dsz, $das, $dln type SETA type+(tempcpu:SHL:CT_Dsize_pos) [ :LNOT:($s=0 :LAND: "$isz"="") tempcpu CSzDesc $isz, $ias, $iln ] type SETA type+(tempcpu:SHL:CT_Isize_pos) ASSERT ($id :AND: :NOT: $mask) = 0 DCD $id, $mask, type DCB $proc, $arch, 0, 0 MEND MACRO $var CSzDesc $sz, $as, $ln $var SETA (CT_size_$sz:SHL:CT_size_pos)+(CT_assoc_$as:SHL:CT_assoc_pos)+(CT_len_$ln:SHL:CT_len_pos) $var SETA $var+(CT_M_$sz:SHL:CT_M_pos) MEND ; CPUDesc table for ARMv3-ARMv6 KnownCPUTable ; /------Cache Type register fields-----\. ; ID reg Mask Arch Type S Dsz Das Dln Isz Ias Iln [ MEMM_Type = "ARM600" CPUDesc ARM600, &000600, &00FFF0, ARMv3, WT, 0, 4K, 64, 4 CPUDesc ARM610, &000610, &00FFF0, ARMv3, WT, 0, 4K, 64, 4 CPUDesc ARMunk, &000000, &00F000, ARMv3, WT, 0, 4K, 64, 4 CPUDesc ARM700, &007000, &FFFFF0, ARMv3, WT, 0, 8K, 4, 8 CPUDesc ARM710, &007100, &FFFFF0, ARMv3, WT, 0, 8K, 4, 8 CPUDesc ARM710a, &047100, &FDFFF0, ARMv3, WT, 0, 8K, 4, 4 CPUDesc ARM7500, &027100, &FFFFF0, ARMv3, WT, 0, 4K, 4, 4 CPUDesc ARM7500FE, &077100, &FFFFF0, ARMv3, WT, 0, 4K, 4, 4 CPUDesc ARMunk, &007000, &80F000, ARMv3, WT, 0, 8K, 4, 4 CPUDesc ARM720T, &807200, &FFFFF0, ARMv4T, WT, 0, 8K, 4, 4 CPUDesc ARMunk, &807000, &80F000, ARMv4T, WT, 0, 8K, 4, 4 CPUDesc SA110_preRevT, &01A100, &0FFFFC, ARMv4, WB_Crd, 1, 16K, 32, 8, 16K, 32, 8 CPUDesc SA110, &01A100, &0FFFF0, ARMv4, WB_Crd, 1, 16K, 32, 8, 16K, 32, 8 CPUDesc SA1100, &01A110, &0FFFF0, ARMv4, WB_Crd, 1, 8K, 32, 8, 16K, 32, 8 CPUDesc SA1110, &01B110, &0FFFF0, ARMv4, WB_Crd, 1, 8K, 32, 8, 16K, 32, 8 CPUDesc ARM920T, &029200, &0FFFF0, ARMv4T, WB_CR7_LDa, 1, 16K, 64, 8, 16K, 64, 8 CPUDesc ARM922T, &029220, &0FFFF0, ARMv4T, WB_CR7_LDa, 1, 8K, 64, 8, 8K, 64, 8 CPUDesc X80200, &052000, &0FFFF0, ARMv5TE, WB_Cal_LD, 1, 32K, 32, 8, 32K, 32, 8 CPUDesc X80321, &69052400, &FFFFF700, ARMv5TE, WB_Cal_LD, 1, 32K, 32, 8, 32K, 32, 8 ] ; MEMM_Type = "ARM600" DCD -1 [ MEMM_Type = "VMSAv6" ; Simplified CPUDesc table for ARMvF ; The cache size data is ignored for ARMv7. 
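;
; Roughly speaking (a C sketch for illustration only - the real code is the
; table-walking loops in ARM_Analyse and ARM_Analyse_Fancy; types as in <stdint.h>,
; find_cpu is a hypothetical name), each 16-byte entry is matched against the ID
; register like this:
;
;   struct cpu_desc { uint32_t id, mask, ctype; uint8_t proc, arch, rsvd[2]; };
;
;   const struct cpu_desc *find_cpu(const struct cpu_desc *t, uint32_t midr)
;   {
;       for (; t->id != 0xFFFFFFFF; t++)        /* table ends with a -1 ID word */
;           if ((midr & t->mask) == t->id)      /* AND with mask, compare to ID */
;               return t;
;       return NULL;                            /* unknown CPU -> ARMunk        */
;   }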
KnownCPUTable_Fancy
        CPUDesc ARM1176JZF_S, &00B760, &00FFF0, ARMvF, WB_CR7_LDc, 1, 16K, 4, 8, 16K, 4, 8
        CPUDesc Cortex_A5,    &00C050, &00FFF0, ARMvF, WB_CR7_Lx,  1, 16K, 32,16, 16K, 32,16
        CPUDesc Cortex_A7,    &00C070, &00FFF0, ARMvF, WB_CR7_Lx,  1, 16K, 32,16, 16K, 32,16
        CPUDesc Cortex_A8,    &00C080, &00FFF0, ARMvF, WB_CR7_Lx,  1, 16K, 32,16, 16K, 32,16
        CPUDesc Cortex_A9,    &00C090, &00FFF0, ARMvF, WB_CR7_Lx,  1, 32K, 32,16, 32K, 32,16
        CPUDesc Cortex_A12,   &00C0D0, &00FFF0, ARMvF, WB_CR7_Lx,  1, 32K, 32,16, 32K, 32,16
        CPUDesc Cortex_A15,   &00C0F0, &00FFF0, ARMvF, WB_CR7_Lx,  1, 32K, 32,16, 32K, 32,16
        CPUDesc Cortex_A17,   &00C0E0, &00FFF0, ARMvF, WB_CR7_Lx,  1, 32K, 32,16, 32K, 32,16
        DCD     -1
 ] ; MEMM_Type = "VMSAv6"

; Peculiar characteristics of individual ARMs not deducible otherwise. First field is
; flags to set, second flags to clear.
KnownCPUFlags
        DCD     0,                                                0  ; ARM 600
        DCD     0,                                                0  ; ARM 610
        DCD     0,                                                0  ; ARM 700
        DCD     0,                                                0  ; ARM 710
        DCD     0,                                                0  ; ARM 710a
        DCD     CPUFlag_AbortRestartBroken+CPUFlag_InterruptDelay, 0 ; SA 110 pre revT
        DCD     CPUFlag_InterruptDelay,                           0  ; SA 110 revT or later
        DCD     0,                                                0  ; ARM 7500
        DCD     0,                                                0  ; ARM 7500FE
        DCD     CPUFlag_InterruptDelay,                           0  ; SA 1100
        DCD     CPUFlag_InterruptDelay,                           0  ; SA 1110
        DCD     CPUFlag_NoWBDrain,                                0  ; ARM 720T
        DCD     0,                                                0  ; ARM 920T
        DCD     0,                                                0  ; ARM 922T
        DCD     CPUFlag_ExtendedPages+CPUFlag_XScale,             0  ; X80200
        DCD     CPUFlag_XScale,                                   0  ; X80321
        DCD     0,                                                0  ; ARM1176JZF_S
        DCD     0,                                                0  ; Cortex_A5
        DCD     0,                                                0  ; Cortex_A7
        DCD     0,                                                0  ; Cortex_A8
        DCD     0,                                                0  ; Cortex_A9
        DCD     0,                                                0  ; Cortex_A12
        DCD     0,                                                0  ; Cortex_A15
        DCD     0,                                                0  ; Cortex_A17

 [ MEMM_Type = "VMSAv6"
; --------------------------------------------------------------------------
; ----- ARM_Analyse_Fancy --------------------------------------------------
; --------------------------------------------------------------------------
;
; For ARMv7 ARMs (arch=&F), we can detect everything via the feature registers
; TODO - There's some stuff in here that can be tidied up/removed
; Things we need to set up:
;   ProcessorType      (as listed in hdr.ARMops)
;   Cache_Type         (CT_ctype_* from hdr:MEMM.ARM600)
;   ProcessorArch      (as reported by Init_ARMarch)
;   ProcessorFlags     (CPUFlag_* from hdr.ARMops)
;   Proc_*             (Cache/TLB/IMB/MMU function pointers)
;   MMU_PCBTrans       (Points to lookup table for translating page table cache options)
;   ICache_*, DCache_* (ICache, DCache properties - optional, since not used externally?)

ARM_Analyse_Fancy
        Push    "v1,v2,v5,v6,v7,lr"
        ARM_read_ID v1
        LDR     v6, =ZeroPage
        ADRL    v7, KnownCPUTable_Fancy
10
        LDMIA   v7!, {a1, a2}
        CMP     a1, #-1
        BEQ     %FT20
        AND     a2, v1, a2
        TEQ     a1, a2
        ADDNE   v7, v7, #8
        BNE     %BT10
20
        LDR     v2, [v7]
        CMP     a1, #-1
        LDRNEB  a2, [v7, #4]
        MOVEQ   a2, #ARMunk
        STRB    a2, [v6, #ProcessorType]
        AND     a1, v2, #CT_ctype_mask
        MOV     a1, a1, LSR #CT_ctype_pos
        STRB    a1, [v6, #Cache_Type]
; STM should always store PC+8
; Should always be base restored abort model
; 26bit has been obsolete for a long time
        MOV     v5, #CPUFlag_StorePCplus8+CPUFlag_BaseRestored+CPUFlag_32bitOS+CPUFlag_No26bitMode
 [ HiProcVecs
        ORR     v5, v5, #CPUFlag_HiProcVecs
 ]
; Work out whether the cache info is in ARMv6 or ARMv7 style
; Top 3 bits of the cache type register give the register format
        ARM_read_cachetype v2
        MOV     a1, v2, LSR #29
        TEQ     a1, #4
        BEQ     %FT25
        TEQ     a1, #0
        BNE     WeirdARMPanic
; ARMv6 format cache type register.
; CPUs like the ARM1176JZF-S are available with a range of cache sizes,
; so it's not safe to rely on the values in the CPU table.
Fortunately ; all ARMv6 CPUs implement the register (by contrast, for the "plain" ; ARM case, no ARMv3 CPUs, some ARMv4 CPUs and all ARMv5 CPUs, so it ; needs to drop back to the table in some cases). MOV a1, v2, LSR #CT_Isize_pos ADD a2, v6, #ICache_Info BL EvaluateCache MOV a1, v2, LSR #CT_Dsize_pos ADD a2, v6, #DCache_Info BL EvaluateCache TST v2, #CT_S ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas+CPUFlag_SplitCache B %FT27 25 ; ARMv7 format cache type register. ; This should(!) mean that we have the cache level ID register, ; and all the other ARMv7 cache registers. ; Do we have a split cache? MRC p15, 1, a1, c0, c0, 1 AND a2, a1, #7 TEQ a2, #3 ORREQ v5, v5, #CPUFlag_SynchroniseCodeAreas+CPUFlag_SplitCache 27 [ CacheOff ORR v5, v5, #CPUFlag_SynchroniseCodeAreas | ARM_read_control a1 ; if Z bit set then we have branch prediction, TST a1, #MMUC_Z ; so we need OS_SynchroniseCodeAreas even if not ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas ; split caches ] BL Init_ARMarch STRB a1, [v6, #ProcessorArch] MRC p15, 0, a1, c0, c2, 2 TST a1, #&F000 ORRNE v5, v5, #CPUFlag_LongMul MRC p15, 0, a1, c0, c1, 0 TST a1, #&F000 ORRNE v5, v5, #CPUFlag_Thumb MSR CPSR_f, #Q32_bit MRS lr, CPSR TST lr, #Q32_bit ORRNE v5, v5, #CPUFlag_DSP ; Should we check instruction set attr register 3 for this? ; Other flags not checked for above: ; CPUFlag_InterruptDelay ; CPUFlag_VectorReadException ; CPUFlag_ExtendedPages ; CPUFlag_NoWBDrain ; CPUFlag_AbortRestartBroken ; CPUFlag_XScale ; CPUFlag_XScaleJTAGconnected LDRB v4, [v6, #ProcessorType] TEQ v4, #ARMunk ; Modify deduced flags ADRNEL lr, KnownCPUFlags ADDNE lr, lr, v4, LSL #3 LDMNEIA lr, {a2, a3} ORRNE v5, v5, a2 BICNE v5, v5, a3 STR v5, [v6, #ProcessorFlags] ; Cache analysis LDRB a2, [v6, #Cache_Type] TEQ a2, #CT_ctype_WB_CR7_LDa ; eg. ARM9 TEQNE a2, #CT_ctype_WB_CR7_LDc ; eg. ARM1176JZF-S - differs only in cache lockdown BEQ Analyse_WB_CR7_LDa TEQ a2, #CT_ctype_WB_CR7_Lx BEQ Analyse_WB_CR7_Lx ; eg. Cortex-A8, Cortex-A9 ; others ... 
B WeirdARMPanic ; stiff :) ] ; MEMM_Type = "VMSAv6" ; -------------------------------------------------------------------------- ; ----- ARMops ------------------------------------------------------------- ; -------------------------------------------------------------------------- ; ; ARMops are the routines required by the kernel for cache/MMU control ; the kernel vectors to the appropriate ops for the given ARM at boot ; ; The Rules: ; - These routines may corrupt a1 and lr only ; - (lr can of course only be corrupted whilst still returning to correct ; link address) ; - stack is available, at least 16 words can be stacked ; - a NULL op would be a simple MOV pc, lr ; ; In: r1 = cache level (0-based) ; Out: r0 = Flags ; bits 0-2: cache type: ; 000 -> none ; 001 -> instruction ; 010 -> data ; 011 -> split ; 100 -> unified ; 1xx -> reserved ; Other bits: reserved ; r1 = D line length ; r2 = D size ; r3 = I line length ; r4 = I size ; r0-r4 = zero if cache level not present Cache_Examine_Simple TEQ r1, #0 MOVNE r0, #0 MOVNE r1, #0 MOVNE r2, #0 MOVNE r3, #0 MOVNE r4, #0 MOVNE pc, lr LDR r4, =ZeroPage LDR r0, [r4, #ProcessorFlags] TST r0, #CPUFlag_SplitCache MOVNE r0, #3 MOVEQ r0, #4 LDRB r1, [r4, #DCache_LineLen] LDR r2, [r4, #DCache_Size] LDRB r3, [r4, #ICache_LineLen] LDR r4, [r4, #ICache_Size] MOV pc, lr [ MEMM_Type = "ARM600" ; -------------------------------------------------------------------------- ; ----- ARMops for ARMv3 --------------------------------------------------- ; -------------------------------------------------------------------------- ; ; ARMv3 ARMs include ARM710, ARM610, ARM7500 ; Cache_Invalidate_ARMv3 MCR p15, 0, a1, c7, c0 NullOp MOV pc, lr WriteBuffer_Drain_ARMv3 ;swap always forces unbuffered write, stalling till WB empty SUB sp, sp, #4 SWP a1, a1, [sp] ADD sp, sp, #4 MOV pc, lr TLB_Invalidate_ARMv3 MCR p15, 0, a1, c5, c0 MOV pc, lr ; a1 = page entry to invalidate (page aligned address) ; TLB_InvalidateEntry_ARMv3 MCR p15, 0, a1, c6, c0 MOV pc, lr MMU_Changing_ARMv3 MCR p15, 0, a1, c7, c0 ; invalidate cache MCR p15, 0, a1, c5, c0 ; invalidate TLB MOV pc, lr MMU_ChangingUncached_ARMv3 MCR p15, 0, a1, c5, c0 ; invalidate TLB MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_ARMv3 MCR p15, 0, a1, c7, c0 ; invalidate cache MCR p15, 0, a1, c6, c0 ; invalidate TLB entry MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_ARMv3 ROUT CMP a2, #16 ; arbitrary-ish threshold BHS MMU_Changing_ARMv3 Push "a2" MCR p15, 0, a1, c7, c0 ; invalidate cache 10 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2" MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingUncachedEntry_ARMv3 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries_ARMv3 ROUT CMP a2, #16 ; arbitrary-ish threshold BHS MMU_ChangingUncached_ARMv3 Push "a2" 10 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2" MOV pc, lr Cache_RangeThreshold_ARMv3 ! 0, "arbitrary Cache_RangeThreshold_ARMv3" MOV a1, #16*PageSize MOV pc, lr LTORG ; -------------------------------------------------------------------------- ; ----- generic ARMops for simple ARMs, ARMv4 onwards ---------------------- ; -------------------------------------------------------------------------- ; ; eg. 
; ARM7TDMI based ARMs, unified, writethrough cache
;
Cache_InvalidateUnified
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7
        MOV     pc, lr

WriteBuffer_Drain_OffOn ; used if ARM has no drain WBuffer MCR op
        Push    "a2"
        ARM_read_control a1
        BIC     a2, a1, #MMUC_W
        ARM_write_control a2
        ARM_write_control a1
        Pull    "a2"
        MOV     pc, lr

WriteBuffer_Drain ; used if ARM has proper drain WBuffer MCR op
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4
        MOV     pc, lr

TLB_Invalidate_Unified
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7
        MOV     pc, lr

; a1 = page entry to invalidate (page aligned address)
;
TLB_InvalidateEntry_Unified
        MCR     p15, 0, a1, c8, c7, 1
        MOV     pc, lr

MMU_Changing_Writethrough
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7              ; invalidate cache
        MCR     p15, 0, a1, c8, c7              ; invalidate TLB
        MOV     pc, lr

MMU_ChangingUncached
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7              ; invalidate TLB
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_Writethrough
        Push    "a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7              ; invalidate cache
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate TLB entry
        Pull    "a4"
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_Writethrough ROUT
        CMP     a2, #16                         ; arbitrary-ish threshold
        BHS     MMU_Changing_Writethrough
        Push    "a2,a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7              ; invalidate cache
10
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate TLB entry
        SUBS    a2, a2, #1                      ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2,a4"
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingUncachedEntry
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate TLB entry
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries ROUT
        CMP     a2, #16                         ; arbitrary-ish threshold
        BHS     MMU_ChangingUncached
        Push    "a2"
10
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate TLB entry
        SUBS    a2, a2, #1                      ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr

Cache_RangeThreshold_Writethrough
        ! 0, "arbitrary Cache_RangeThreshold_Writethrough"
        MOV     a1, #16*PageSize
        MOV     pc, lr

 ] ; MEMM_Type = "ARM600"

; --------------------------------------------------------------------------
; ----- ARMops for ARM9 and the like ---------------------------------------
; --------------------------------------------------------------------------
;
; WB_CR7_LDa refers to ARMs with writeback data cache, cleaned with
; register 7, lockdown available (format A)
;
; Note that ARM920 etc have writeback/writethrough data cache selectable
; by MMU regions. For simplicity, we assume cacheable pages are mostly
; writeback. Any writethrough pages will have redundant clean operations
; applied when moved, for example, but this is a small overhead (cleaning
; a clean line is very quick on ARM 9).
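;
; As a rough sketch (C-like pseudocode for illustration, using the ARM920 example
; values quoted in the header of Cache_CleanAll_WB_CR7_LDa below; the field
; positions differ on other cores, which is why the values are held in
; DCache_IndexBit/DCache_LineLen/DCache_IndexSegStart), the clean-all loop
; amounts to:
;
;   for (seg = 7; seg >= 0; seg--)        /* segment field, steps of DCache_LineLen */
;       for (idx = 0; idx < 64; idx++)    /* index field,   steps of DCache_IndexBit */
;           clean_dcache_seg_idx((idx << 26) | (seg << 5));  /* MCR p15,0,Rd,c7,c10,2 */
;   drain_write_buffer();                 /* MCR p15,0,Rd,c7,c10,4 */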
Cache_CleanAll_WB_CR7_LDa ROUT ; ; only guarantees to clean lines not involved in interrupts (so we can ; clean without disabling interrupts) ; ; Clean cache by traversing all segment and index values ; As a concrete example, for ARM 920 (16k+16k caches) we would have: ; ; DCache_LineLen = 32 (32 byte cache line, segment field starts at bit 5) ; DCache_IndexBit = &04000000 (index field starts at bit 26) ; DCache_IndexSegStart = &000000E0 (start at index=0, segment = 7) ; Push "a2, ip" LDR ip, =ZeroPage LDRB a1, [ip, #DCache_LineLen] ; segment field starts at this bit LDR a2, [ip, #DCache_IndexBit] ; index field starts at this bit LDR ip, [ip, #DCache_IndexSegStart] ; starting value, with index at min, seg at max 10 MCR p15, 0, ip, c7, c10, 2 ; clean DCache entry by segment/index ADDS ip, ip, a2 ; next index, counting up, CS if wrapped back to 0 BCC %BT10 SUBS ip, ip, a1 ; next segment, counting down, CC if wrapped back to max BCS %BT10 ; if segment wrapped, then we've finished MOV ip, #0 MCR p15, 0, ip, c7, c10, 4 ; drain WBuffer Pull "a2, ip" MOV pc, lr Cache_CleanInvalidateAll_WB_CR7_LDa ROUT ; ; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache ; Push "a2, ip" LDR ip, =ZeroPage LDRB a1, [ip, #DCache_LineLen] ; segment field starts at this bit LDR a2, [ip, #DCache_IndexBit] ; index field starts at this bit LDR ip, [ip, #DCache_IndexSegStart] ; starting value, with index at min, seg at max 10 MCR p15, 0, ip, c7, c14, 2 ; clean&invalidate DCache entry by segment/index ADDS ip, ip, a2 ; next index, counting up, CS if wrapped back to 0 BCC %BT10 SUBS ip, ip, a1 ; next segment, counting down, CC if wrapped back to max BCS %BT10 ; if segment wrapped, then we've finished MOV ip, #0 MCR p15, 0, ip, c7, c10, 4 ; drain WBuffer MCR p15, 0, ip, c7, c5, 0 ; invalidate ICache Pull "a2, ip" MOV pc, lr Cache_InvalidateAll_WB_CR7_LDa ROUT ; ; no clean, assume caller knows what's happening ; MOV a1, #0 MCR p15, 0, a1, c7, c7, 0 ; invalidate ICache and DCache MOV pc, lr Cache_RangeThreshold_WB_CR7_LDa ROUT LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_CR7_LDa ROUT MMU_ChangingUncached_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MOV pc, lr ; a1 = page affected (page aligned address) ; TLB_InvalidateEntry_WB_CR7_LDa ROUT MMU_ChangingUncachedEntry_WB_CR7_LDa MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MOV pc, lr WriteBuffer_Drain_WB_CR7_LDa ROUT MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MOV pc, lr IMB_Full_WB_CR7_LDa ROUT ; ; do: clean DCache; drain WBuffer, invalidate ICache ; Push "lr" BL Cache_CleanAll_WB_CR7_LDa ; also drains Wbuffer MOV a1, #0 MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache Pull "pc" ; a1 = start address (inclusive, cache line aligned) ; a2 = end address (exclusive, cache line aligned) ; IMB_Range_WB_CR7_LDa ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_CR7_LDa Push "lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry by VA MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MCR p15, 0, a1, c7, c5, 6 ; flush branch predictors Pull "pc" ; a1 = pointer to list of (start, end) address pairs ; a2 = pointer to end of list ; a3 = total amount of memory to be synchronised ; IMB_List_WB_CR7_LDa ROUT CMP a3, #32*1024 ; arbitrary-ish 
range threshold BHS IMB_Full_WB_CR7_LDa Push "v1-v2,lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 05 LDMIA a1!, {v1-v2} 10 MCR p15, 0, v1, c7, c10, 1 ; clean DCache entry by VA MCR p15, 0, v1, c7, c5, 1 ; invalidate ICache entry ADD v1, v1, lr CMP v1, v2 BLO %BT10 CMP a1, a2 BNE %BT05 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MCR p15, 0, a1, c7, c5, 6 ; flush branch predictors Pull "v1-v2,pc" MMU_Changing_WB_CR7_LDa ROUT Push "lr" BL Cache_CleanInvalidateAll_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB Pull "pc" ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_WB_CR7_LDa ROUT Push "a2, lr" ADD a2, a1, #PageSize LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MOV lr, #0 MCR p15, 0, lr, c7, c10, 4 ; drain WBuffer MCR p15, 0, a1, c7, c5, 6 ; flush branch predictors SUB a1, a1, #PageSize MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry Pull "a2, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_WB_CR7_LDa ROUT Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MCR p15, 0, a1, c7, c5, 6 ; flush branch predictors MOV a1, lr ; restore start address 20 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 Pull "a2, a3, pc" ; 30 BL Cache_CleanInvalidateAll_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB Pull "a2, a3, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries_WB_CR7_LDa ROUT CMP a2, #32 ; arbitrary-ish threshold BHS %FT20 Push "a2" 10 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize SUBS a2, a2, #1 BNE %BT10 Pull "a2" MOV pc, lr ; 20 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MOV pc, lr [ MEMM_Type = "ARM600" ; -------------------------------------------------------------------------- ; ----- ARMops for StrongARM and the like ---------------------------------- ; -------------------------------------------------------------------------- ; WB_Crd is Writeback data cache, clean by reading data from cleaner area ; Currently no support for mini data cache on some StrongARM variants. Mini ; cache is always writeback and must have cleaning support, so is very ; awkward to use for cacheable screen, say. ; Global cache cleaning requires address space for private cleaner areas (not accessed ; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency ; hit), which means that the cleaner data is not invalidated afterwards. This is fine for ; RISC OS - where the private area is not used for anything else, and any re-use of the ; cache under interrupts is safe (eg. a page being moved is *never* involved in any ; active interrupts). 
; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum ; cleaning cost while guaranteeing proper clean even if previous clean data is present. If ; the clean routine is re-entered, an independent, double sized clean is initiated. This ; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating ; the routine at all. The overhead is small, since by far the most common cleaning will be ; non-re-entered. The upshot is that the cleaner address space available must be at least 4 ; times the cache size: ; 1 : used alternately, on 1st, 3rd, ... non-re-entered cleans ; 2 : used alternately, on 2nd, 4th, ... non-re-entered cleans ; 3 : used only for first half of a re-entered clean ; 4 : used only for second half of a re-entered clean ; ; DCache_CleanBaseAddress : start address of total cleaner space ; DCache_CleanNextAddress : start address for next non-re-entered clean, or 0 if re-entered Cache_CleanAll_WB_Crd ROUT ; ; - cleans data cache (and invalidates it as a side effect) ; - can be used with interrupts enabled (to avoid latency over time of clean) ; - can be re-entered ; - see remarks at top of StrongARM ops for discussion of strategy ; Push "a2-a4, v1, v2, lr" LDR lr, =ZeroPage LDR a1, [lr, #DCache_CleanBaseAddress] LDR a2, =DCache_CleanNextAddress LDR a3, [lr, #DCache_Size] LDRB a4, [lr, #DCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 LDR lr, [v1], a4 TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Cache_Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer Pull "a2-a4, v1, v2, pc" Cache_CleanInvalidateAll_WB_Crd ROUT IMB_Full_WB_Crd ; ;does not truly invalidate DCache, but effectively invalidates (flushes) all lines not ;involved in interrupts - this is sufficient for OS requirements, and means we don't ;have to disable interrupts for possibly slow clean ; Push "lr" BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache Pull "pc" Cache_InvalidateAll_WB_Crd ; ; no clean, assume caller knows what is happening ; MCR p15, 0, a1, c7, c7, 0 ;flush ICache and DCache MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MOV pc, lr Cache_RangeThreshold_WB_Crd LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_Crd MMU_ChangingUncached_WB_Crd MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB MOV pc, lr TLB_InvalidateEntry_WB_Crd MMU_ChangingUncachedEntry_WB_Crd MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry MCR p15, 0, a1, c8, c5, 0 ;flush ITLB MOV pc, lr WriteBuffer_Drain_WB_Crd MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MOV pc, lr IMB_Range_WB_Crd ROUT SUB a2, a2, a1 CMP a2, #64*1024 ;arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_Crd Push "lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache Pull "pc" IMB_List_WB_Crd ROUT CMP a3, #64*1024 ;arbitrary-ish range threshold BHS IMB_Full_WB_Crd Push "v1-v2,lr" LDR lr, 
=ZeroPage LDRB lr, [lr, #DCache_LineLen] 05 LDMIA a1!, {v1-v2} 10 MCR p15, 0, v1, c7, c10, 1 ;clean DCache entry ADD v1, v1, lr CMP v1, v2 BLO %BT10 CMP a1, a2 BNE %BT05 MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache Pull "v1-v2,pc" MMU_Changing_WB_Crd Push "lr" BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB Pull "pc" MMU_ChangingEntry_WB_Crd ROUT ; ;there is no clean&invalidate DCache instruction, however we can do clean ;entry followed by invalidate entry without an interrupt hole, because they ;are for the same virtual address (and that virtual address will not be ;involved in interrupts, since it is involved in remapping) ; Push "a2, lr" ADD a2, a1, #PageSize LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry MCR p15, 0, a1, c7, c6, 1 ;flush DCache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 SUB a1, a1, #PageSize MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "a2, pc" MMU_ChangingEntries_WB_Crd ROUT ; ;same comments as MMU_ChangingEntry_WB_Crd ; Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry MCR p15, 0, a1, c7, c6, 1 ;flush DCache entry ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache MOV a1, lr ;restore start address 20 MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "a2, a3, pc" ; 30 BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB Pull "a2, a3, pc" MMU_ChangingUncachedEntries_WB_Crd ROUT CMP a2, #32 ;arbitrary-ish threshold BHS %FT20 Push "lr" MOV lr, a2 10 MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry ADD a1, a1, #PageSize SUBS lr, lr, #1 BNE %BT10 MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "pc" ; 20 MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB MOV pc, lr LTORG ; ARMops for XScale, mjs Feb 2001 ; ; WB_Cal_LD is writeback, clean with allocate, lockdown ; ; If the mini data cache is used (XScaleMiniCache true), it is assumed to be ; configured writethrough (eg. used for RISC OS screen memory). This saves an ugly/slow ; mini cache clean for things like IMB_Full. ; ; Sadly, for global cache invalidate with mini cache, things are awkward. We can't clean the ; main cache then do the global invalidate MCR, unless we tolerate having _all_ interrupts ; off (else the main cache may be slightly dirty from interrupts, and the invalidate ; will lose data). So we must reluctantly 'invalidate' the mini cache by the ugly/slow ; mechanism as if we were cleaning it :-( Intel should provide a separate global invalidate ; (and perhaps a line allocate) for the mini cache. ; ; We do not use lockdown. ; ; For simplicity, we assume cacheable pages are mostly writeback. Any writethrough ; pages will be invalidated as if they were writeback, but there is little overhead ; (cleaning a clean line or allocating a line from cleaner area are both fast). 
; Global cache cleaning requires address space for private cleaner areas (not accessed ; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency ; hit), which means that the cleaner data is not invalidated afterwards. This is fine for ; RISC OS - where the private area is not used for anything else, and any re-use of the ; cache under interrupts is safe (eg. a page being moved is *never* involved in any ; active interrupts). ; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum ; cleaning cost while guaranteeing proper clean even if previous clean data is present. If ; the clean routine is re-entered, an independent, double sized clean is initiated. This ; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating ; the routine at all. The overhead is small, since by far the most common cleaning will be ; non-re-entered. The upshot is that the cleaner address space available must be at least 4 ; times the cache size: ; 1 : used alternately, on 1st, 3rd, ... non-re-entered cleans ; 2 : used alternately, on 2nd, 4th, ... non-re-entered cleans ; 3 : used only for first half of a re-entered clean ; 4 : used only for second half of a re-entered clean ; ; If the mini cache is used, it has its own equivalent cleaner space and algorithm. ; Parameters for each cache are: ; ; Cache_CleanBaseAddress : start address of total cleaner space ; Cache_CleanNextAddress : start address for next non-re-entered clean, or 0 if re-entered GBLL XScaleMiniCache ; *must* be configured writethrough if used XScaleMiniCache SETL {FALSE} ; MACRO to do Intel approved CPWAIT, to guarantee any previous MCR's have taken effect ; corrupts a1 ; MACRO CPWAIT MRC p15, 0, a1, c2, c0, 0 ; arbitrary read of CP15 MOV a1, a1 ; wait for it ; SUB pc, pc, #4 omitted, because all ops have a pc load to return to caller MEND Cache_CleanAll_WB_Cal_LD ROUT ; ; - cleans main cache (and invalidates as a side effect) ; - if mini cache is in use, will be writethrough so no clean required ; - can be used with interrupts enabled (to avoid latency over time of clean) ; - can be re-entered ; - see remarks at top of XScale ops for discussion of strategy ; Push "a2-a4, v1, v2, lr" LDR lr, =ZeroPage LDR a1, [lr, #DCache_CleanBaseAddress] LDR a2, =ZeroPage+DCache_CleanNextAddress LDR a3, [lr, #DCache_Size] LDRB a4, [lr, #DCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 MCR p15, 0, v1, c7, c2, 5 ; allocate address from cleaner space ADD v1, v1, a4 TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Cache_Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) Pull "a2-a4, v1, v2, pc" [ XScaleMiniCache Cache_MiniInvalidateAll_WB_Cal_LD ROUT ; ; similar to Cache_CleanAll_WB_Cal_LD, but must do direct reads (cannot use allocate address MCR), and ; 'cleans' to achieve invalidate as side effect (mini cache will be configured writethrough) ; Push "a2-a4, v1, v2, lr" LDR lr, =ZeroPage LDR a1, [lr, #MCache_CleanBaseAddress] LDR a2, =ZeroPage+MCache_CleanNextAddr LDR a3, [lr, 
#MCache_Size] LDRB a4, [lr, #MCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 LDR lr, [v1], a4 ; read a line of cleaner data TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next ; note, no drain WBuffer, since we are really only invalidating a writethrough cache Pull "a2-a4, v1, v2, pc" ] ; XScaleMiniCache Cache_CleanInvalidateAll_WB_Cal_LD ROUT ; ; - cleans main cache (and invalidates wrt OS stuff as a side effect) ; - if mini cache in use (will be writethrough), 'cleans' in order to invalidate as side effect ; Push "lr" BL Cache_CleanAll_WB_Cal_LD [ XScaleMiniCache BL Cache_MiniInvalidateAll_WB_Cal_LD ] MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB CPWAIT Pull "pc" Cache_InvalidateAll_WB_Cal_LD ROUT ; ; no clean, assume caller knows what's happening ; MCR p15, 0, a1, c7, c7, 0 ; invalidate DCache, (MiniCache), ICache and BTB CPWAIT MOV pc, lr Cache_RangeThreshold_WB_Cal_LD ROUT LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_Cal_LD ROUT MMU_ChangingUncached_WB_Cal_LD MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT MOV pc, lr TLB_InvalidateEntry_WB_Cal_LD ROUT MMU_ChangingUncachedEntry_WB_Cal_LD MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry CPWAIT MOV pc, lr WriteBuffer_Drain_WB_Cal_LD ROUT MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) MOV pc, lr IMB_Full_WB_Cal_LD Push "lr" BL Cache_CleanAll_WB_Cal_LD ; clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB CPWAIT Pull "pc" IMB_Range_WB_Cal_LD ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_Cal_LD Push "lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, lr CMP a1, a2 BLO %BT10 [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) Pull "pc" IMB_List_WB_Cal_LD ROUT CMP a3, #32*1024 ; arbitrary-ish range threshold BHS IMB_Full_WB_Cal_LD Push "v1-v2,lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 05 LDMIA a1!, {v1-v2} 10 MCR p15, 0, v1, c7, c10, 1 ; clean DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, v1, c7, c5, 1 ; invalidate ICache entry ] ADD v1, v1, lr CMP v1, v2 BLO %BT10 CMP a1, a2 BNE %BT05 [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) Pull "v1-v2,pc" MMU_Changing_WB_Cal_LD ROUT Push "lr" BL Cache_CleanAll_WB_Cal_LD MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT Pull "pc" MMU_ChangingEntry_WB_Cal_LD ROUT ; ;there is no clean&invalidate DCache instruction, however we can do clean ;entry followed by invalidate entry without an interrupt 
hole, because they ;are for the same virtual address (and that virtual address will not be ;involved in interrupts, since it is involved in remapping) ; Push "a2, lr" ADD a2, a1, #PageSize LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry MCR p15, 0, a1, c7, c6, 1 ; invalidate DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, lr CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] SUB a1, a1, #PageSize MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry CPWAIT Pull "a2, pc" MMU_ChangingEntries_WB_Cal_LD ROUT ; ;same comments as MMU_ChangingEntry_WB_Cal_LD ; Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry MCR p15, 0, a1, c7, c6, 1 ; invalidate DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] MOV a1, lr ; restore start address 20 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 CPWAIT Pull "a2, a3, pc" ; 30 BL Cache_CleanInvalidateAll_WB_Cal_LD MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT Pull "a2, a3, pc" MMU_ChangingUncachedEntries_WB_Cal_LD ROUT CMP a2, #32 ; arbitrary-ish threshold BHS %FT20 Push "lr" MOV lr, a2 10 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry SUBS lr, lr, #1 ADD a1, a1, #PageSize BNE %BT10 CPWAIT Pull "pc" ; 20 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT MOV pc, lr ] ; MEMM_Type = "ARM600" [ MEMM_Type = "VMSAv6" ; Need appropriate myIMB, etc. 
implementations if this is to be removed ; -------------------------------------------------------------------------- ; ----- ARMops for Cortex-A8 and the like ---------------------------------- ; -------------------------------------------------------------------------- ; WB_CR7_Lx refers to ARMs with writeback data cache, cleaned with ; register 7, and (potentially) multiple cache levels ; ; DCache_LineLen = log2(line len)-2 for smallest data/unified cache line length ; ICache_LineLen = log2(line len)-2 for smallest instruction cache line length ; DCache_RangeThreshold = clean threshold for data cache ; Cache_Lx_Info = Cache level ID register ; Cache_Lx_DTable = Cache size identification register for all 7 data/unified caches ; Cache_Lx_ITable = Cache size identification register for all 7 instruction caches ; ARMv7 cache maintenance routines are a bit long-winded, so we use this macro ; to reduce the risk of mistakes creeping in due to code duplication ; ; $op: Operation to perform ('clean', 'invalidate', 'cleaninvalidate') ; $levels: Which levels to apply to ('lou', 'loc', 'louis') ; Uses r0-r8 & lr as temp ; Performs the indicated op on the indicated data & unified caches ; ; Code based around the alternate/faster code given in the ARMv7 ARM (section ; B2.2.4, alternate/faster code only in doc revision 9), but tightened up a bit ; ; Note that HAL_InvalidateCache_ARMvF uses its own implementation of this ; algorithm, since it must cope with different temporary registers and it needs ; to read the cache info straight from the CP15 registers ; MACRO MaintainDataCache_WB_CR7_Lx $op, $levels LDR lr, =ZeroPage LDR r0, [lr, #Cache_Lx_Info]! ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info [ "$levels"="lou" ANDS r3, r0, #&38000000 MOV r3, r3, LSR #26 ; Cache level value (naturally aligned) | [ "$levels"="loc" ANDS r3, r0, #&07000000 MOV r3, r3, LSR #23 ; Cache level value (naturally aligned) | [ "$levels"="louis" ANDS r3, r0, #&00E00000 MOV r3, r3, LSR #20 ; Cache level value (naturally aligned) | ! 1, "Unrecognised levels" ] ] ] BEQ %FT50 MOV r8, #0 ; Current cache level 10 ; Loop1 ADD r2, r8, r8, LSR #1 ; Work out 3 x cachelevel MOV r1, r0, LSR r2 ; bottom 3 bits are the Cache type for this level AND r1, r1, #7 ; get those 3 bits alone CMP r1, #2 BLT %FT40 ; no cache or only instruction cache at this level LDR r1, [lr, r8, LSL #1] ; read CCSIDR to r1 AND r2, r1, #CCSIDR_LineSize_mask ; extract the line length field ADD r2, r2, #4 ; add 4 for the line length offset (log2 16 bytes) LDR r7, =CCSIDR_Associativity_mask:SHR:CCSIDR_Associativity_pos AND r7, r7, r1, LSR #CCSIDR_Associativity_pos ; r7 is the max number on the way size (right aligned) CLZ r5, r7 ; r5 is the bit position of the way size increment LDR r4, =CCSIDR_NumSets_mask:SHR:CCSIDR_NumSets_pos AND r4, r4, r1, LSR #CCSIDR_NumSets_pos ; r4 is the max number of the index size (right aligned) 20 ; Loop2 MOV r1, r4 ; r1 working copy of the max index size (right aligned) 30 ; Loop3 ORR r6, r8, r7, LSL r5 ; factor in the way number and cache number into r6 ORR r6, r6, r1, LSL r2 ; factor in the index number [ "$op"="clean" MCR p15, 0, r6, c7, c10, 2 ; Clean | [ "$op"="invalidate" MCR p15, 0, r6, c7, c6, 2 ; Invalidate | [ "$op"="cleaninvalidate" MCR p15, 0, r6, c7, c14, 2 ; Clean & invalidate | ! 1, "Unrecognised op" ] ] ] SUBS r1, r1, #1 ; decrement the index BGE %BT30 SUBS r7, r7, #1 ; decrement the way number BGE %BT20 myDSB ,r7 ; Cortex-A7 errata 814220: DSB required when changing cache levels when using set/way operations. 
This also counts as our end-of-maintenance DSB. 40 ; Skip ADD r8, r8, #2 CMP r3, r8 BGT %BT10 50 ; Finished MEND Cache_CleanAll_WB_CR7_Lx ROUT ; Clean cache by traversing all sets and ways for all data caches Push "r1-r8,lr" MaintainDataCache_WB_CR7_Lx clean, loc Pull "r1-r8,pc" Cache_CleanInvalidateAll_WB_CR7_Lx ROUT ; ; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache ; Push "r1-r8,lr" MaintainDataCache_WB_CR7_Lx cleaninvalidate, loc MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible Pull "r1-r8,pc" Cache_InvalidateAll_WB_CR7_Lx ROUT ; ; no clean, assume caller knows what's happening ; Push "r1-r8,lr" MaintainDataCache_WB_CR7_Lx invalidate, loc MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible Pull "r1-r8,pc" Cache_RangeThreshold_WB_CR7_Lx ROUT LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr ; In: r1 = cache level (0-based) ; Out: r0 = Flags ; bits 0-2: cache type: ; 000 -> none ; 001 -> instruction ; 010 -> data ; 011 -> split ; 100 -> unified ; 1xx -> reserved ; Other bits: reserved ; r1 = D line length ; r2 = D size ; r3 = I line length ; r4 = I size ; r0-r4 = zero if cache level not present Cache_Examine_WB_CR7_Lx ROUT Entry "r5" LDR r5, =ZeroPage LDR r0, [r5, #Cache_Lx_Info]! ADD r5, r5, #Cache_Lx_DTable-Cache_Lx_Info BIC r0, r0, #&00E00000 ; Shift the CLIDR until we hit a zero entry or the desired level ; (could shift by exactly the amount we want... but ARM say not to do ; that since they may decide to re-use bits) 10 TEQ r1, #0 TSTNE r0, #7 SUBNE r1, r1, #1 MOVNE r0, r0, LSR #3 ADDNE r5, r5, #4 BNE %BT10 ANDS r0, r0, #7 MOV r1, #0 MOV r2, #0 MOV r3, #0 MOV r4, #0 EXIT EQ TST r0, #6 ; Data or unified cache present? BEQ %FT20 LDR lr, [r5] LDR r1, =CCSIDR_NumSets_mask:SHR:CCSIDR_NumSets_pos LDR r2, =CCSIDR_Associativity_mask:SHR:CCSIDR_Associativity_pos AND r1, r1, lr, LSR #CCSIDR_NumSets_pos AND r2, r2, lr, LSR #CCSIDR_Associativity_pos ADD r1, r1, #1 ADD r2, r2, #1 MUL r2, r1, r2 AND r1, lr, #CCSIDR_LineSize_mask ASSERT CCSIDR_LineSize_pos = 0 MOV lr, #16 MOV r1, lr, LSL r1 MUL r2, r1, r2 20 TEQ r0, #4 ; Unified cache? MOVEQ r3, r1 MOVEQ r4, r2 TST r0, #1 ; Instruction cache present? 
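; The CCSIDR decode used just above (and repeated below for the instruction
; cache) can be summarised by the following illustrative C sketch. This is
; not part of the build; it assumes the usual ARMv7 CCSIDR layout that the
; CCSIDR_* constants are expected to encode (LineSize in bits 2:0,
; Associativity in bits 12:3, NumSets in bits 27:13).
;
;   /* Decode one CCSIDR value into cache geometry (illustrative only). */
;   static void decode_ccsidr(unsigned ccsidr,
;                             unsigned *line_len, unsigned *size)
;   {
;       unsigned sets = ((ccsidr >> 13) & 0x7FFF) + 1;  /* NumSets + 1     */
;       unsigned ways = ((ccsidr >> 3)  & 0x3FF)  + 1;  /* Assoc + 1       */
;       *line_len = 16u << (ccsidr & 7);                /* 16-byte minimum */
;       *size     = sets * ways * *line_len;            /* total bytes     */
;   }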
EXIT EQ LDR lr, [r5, #Cache_Lx_ITable-Cache_Lx_DTable] LDR r3, =CCSIDR_NumSets_mask:SHR:CCSIDR_NumSets_pos LDR r4, =CCSIDR_Associativity_mask:SHR:CCSIDR_Associativity_pos AND r3, r3, lr, LSR #CCSIDR_NumSets_pos AND r4, r4, lr, LSR #CCSIDR_Associativity_pos ADD r3, r3, #1 ADD r4, r4, #1 MUL r4, r3, r4 AND r3, lr, #CCSIDR_LineSize_mask ASSERT CCSIDR_LineSize_pos = 0 MOV lr, #16 MOV r3, lr, LSL r3 MUL r4, r3, r4 EXIT MMU_ChangingUncached_WB_CR7_Lx myDSB ,a1 ; Ensure the page table write has actually completed myISB ,a1,,y ; Also required TLB_InvalidateAll_WB_CR7_Lx ROUT MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingUncachedEntry_WB_CR7_Lx [ NoARMv7 Push "a2" myDSB ,a2 ; Ensure the page table write has actually completed myISB ,a2,,y ; Also required Pull "a2" | myDSB myISB ] TLB_InvalidateEntry_WB_CR7_Lx ROUT MCR p15, 0, a1, c8, c7, 1 ; invalidate ITLB & DTLB entry MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1 ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible MOV pc, lr WriteBuffer_Drain_WB_CR7_Lx ROUT myDSB ,a1 ; DSB is the new name for write buffer draining myISB ,a1,,y ; Also do ISB for extra paranoia MOV pc, lr IMB_Full_WB_CR7_Lx ROUT ; ; do: clean DCache; drain WBuffer, invalidate ICache/branch predictor ; Luckily, we only need to clean as far as the level of unification ; Push "r1-r8,lr" MaintainDataCache_WB_CR7_Lx clean, lou MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible Pull "r1-r8,pc" ; a1 = start address (inclusive, cache line aligned) ; a2 = end address (exclusive, cache line aligned) ; IMB_Range_WB_CR7_Lx ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; Maximum L1 cache size on Cortex-A8 is 32K, use that to guess what approach to take ADD a2, a2, a1 BHS IMB_Full_WB_CR7_Lx Push "a1,a3,lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] ; log2(line len)-2 MOV a3, #4 MOV lr, a3, LSL lr 10 MCR p15, 0, a1, c7, c11, 1 ; clean DCache entry by VA to PoU ADD a1, a1, lr CMP a1, a2 BLO %BT10 myDSB ,a1 ; Wait for clean to complete Pull "a1" ; Get start address back LDR lr, =ZeroPage LDRB lr, [lr, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ MOV lr, a3, LSL lr 10 MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1 ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible Pull "a3,pc" ; a1 = pointer to list of (start, end) address pairs ; a2 = pointer to end of list ; a3 = total amount of memory to be synchronised ; IMB_List_WB_CR7_Lx ROUT CMP a3, #32*1024 ; Maximum L1 cache size on Cortex-A8 is 32K, use that to guess what approach to take BHS IMB_Full_WB_CR7_Lx Push "a1,a3,v1-v2,lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] ; log2(line len)-2 MOV a3, #4 MOV lr, a3, LSL lr 05 LDMIA a1!, {v1-v2} 10 MCR p15, 0, v1, c7, c11, 1 ; clean DCache entry by VA to PoU ADD v1, v1, 
lr CMP v1, v2 BLO %BT10 CMP a1, a2 BNE %BT05 myDSB ,a1 ; Wait for clean to complete Pull "a1" ; Get start address back LDR lr, =ZeroPage LDRB lr, [lr, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ MOV lr, a3, LSL lr 05 LDMIA a1!, {v1-v2} 10 MCR p15, 0, v1, c7, c5, 1 ; invalidate ICache entry ADD v1, v1, lr CMP v1, v2 BLO %BT10 CMP a1, a2 BNE %BT05 MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1 ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible Pull "a3,v1-v2,pc" MMU_Changing_WB_CR7_Lx ROUT Push "lr" myDSB ,a1 ; Ensure the page table write has actually completed myISB ,a1,,y ; Also required BL Cache_CleanInvalidateAll_WB_CR7_Lx MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB myDSB ,a1,,y ; Wait for TLB invalidation to complete myISB ,a1,,y ; Ensure that the effects are visible Pull "pc" ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_WB_CR7_Lx ROUT Push "a2, lr" myDSB ,lr ; Ensure the page table write has actually completed myISB ,lr,,y ; Also required LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] ; log2(line len)-2 MOV a2, #4 MOV lr, a2, LSL lr ADD a2, a1, #PageSize 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry to PoC ADD a1, a1, lr CMP a1, a2 BNE %BT10 myDSB ,lr ; Wait for clean to complete LDR lr, =ZeroPage LDRB lr, [lr, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ MOV a1, #4 MOV lr, a1, LSL lr SUB a1, a2, #PageSize ; Get start address back 10 MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry to PoC ADD a1, a1, lr CMP a1, a2 BNE %BT10 SUB a1, a1, #PageSize MCR p15, 0, a1, c8, c7, 1 ; invalidate DTLB and ITLB MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1 myISB ,a1,,y Pull "a2, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_WB_CR7_Lx ROUT Push "a2, a3, lr" myDSB ,lr ; Ensure the page table write has actually completed myISB ,lr,,y ; Also required MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] ; log2(line len)-2 MOV lr, #4 MOV a3, lr, LSL a3 MOV lr, a1 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry to PoC ADD a1, a1, a3 CMP a1, a2 BNE %BT10 myDSB ,a3 ; Wait for clean to complete LDR a3, =ZeroPage LDRB a3, [a3, #ICache_LineLen] ; Use ICache line length, just in case D&I length differ MOV a1, #4 MOV a3, a1, LSL a3 MOV a1, lr ; Get start address back 10 MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry to PoC ADD a1, a1, a3 CMP a1, a2 BNE %BT10 20 MCR p15, 0, lr, c8, c7, 1 ; invalidate DTLB & ITLB entry ADD lr, lr, #PageSize CMP lr, a2 BNE %BT20 MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1 myISB ,a1,,y Pull "a2, a3, pc" ; 30 BL Cache_CleanInvalidateAll_WB_CR7_Lx MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB myDSB ,a1,,y ; Wait TLB invalidation to complete myISB ,a1,,y ; Ensure that the effects are visible Pull "a2, a3, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries_WB_CR7_Lx ROUT Push "a2,lr" myDSB ,lr ; Ensure the page table write has actually completed myISB ,lr,,y ; Also required CMP a2, #32 ; arbitrary-ish threshold MCRHS p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB BHS %FT20 10 MCR p15, 0, a1, c8, c7, 1 ; 
invalidate DTLB & ITLB entry ADD a1, a1, #PageSize SUBS a2, a2, #1 BNE %BT10 20 MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,lr,,y myISB ,lr,,y Pull "a2,pc" ; -------------------------------------------------------------------------- ; ----- ARMops for PL310 L2 cache controller-------------------------------- ; -------------------------------------------------------------------------- ; These are a hybrid of the standard ARMv7 ARMops (WB_CR7_Lx) and the PL310 ; cache maintenance ops. Currently they're only used on Cortex-A9 systems, so ; may need modifications to work with other systems. ; Specifically, the code assumes the PL310 is being used in non-exclusive mode. MACRO PL310Sync $regs, $temp ; Errata 753970 requires us to write to a different location when ; performing a sync operation for r3p0 LDR $temp, [$regs, #PL310_REG0_CACHE_ID] AND $temp, $temp, #&3f TEQ $temp, #PL310_R3P0 MOV $temp, #0 STREQ $temp, [$regs, #PL310_REG7_CACHE_SYNC_753970] STRNE $temp, [$regs, #PL310_REG7_CACHE_SYNC] 10 LDR $temp, [$regs, #PL310_REG7_CACHE_SYNC] TST $temp, #1 BNE %BT10 MEND Cache_CleanInvalidateAll_PL310 ROUT ; Errata 727915 workaround - use CLEAN_INV_INDEX instead of CLEAN_INV_WAY Entry "a2-a4" LDR a2, =ZeroPage LDR a2, [a2, #Cache_HALDevice] LDR a2, [a2, #HALDevice_Address] ; Clean ARM caches BL Cache_CleanAll_WB_CR7_Lx ; Determine PL310 way, index count LDR a1, [a2, #PL310_REG1_AUX_CONTROL] AND a3, a1, #1<<16 AND a1, a1, #7<<17 MOV a3, a3, LSL #15 MOV a1, a1, LSR #17 LDR a4, =&FF<<5 ORR a3, a3, #7<<28 ; a3 = max way number (inclusive) ORR a4, a4, a4, LSL a1 ; a4 = max index number (inclusive) ; Ensure no operation currently in progress 05 LDR lr, [a2, #PL310_REG7_CLEAN_INV_INDEX] TST lr, #1 BNE %BT05 10 ORR a1, a3, a4 20 STR a1, [a2, #PL310_REG7_CLEAN_INV_INDEX] 30 LDR lr, [a2, #PL310_REG7_CLEAN_INV_INDEX] TST lr, #1 BNE %BT30 SUBS a1, a1, #1<<28 ; next way BCS %BT20 ; underflow? 
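; For reference, the errata 727915 workaround implemented by
; Cache_CleanInvalidateAll_PL310 (clean & invalidate by index/way rather
; than by way) is roughly the following illustrative C sketch. It is not
; part of the build; read32/write32 are hypothetical MMIO accessors, and
; the 8KB-per-way baseline matches the index mask computed above.
;
;   void pl310_clean_inv_all_by_index(unsigned base)
;   {
;       unsigned aux     = read32(base + PL310_REG1_AUX_CONTROL);
;       unsigned ways    = (aux & (1u << 16)) ? 16 : 8;   /* associativity   */
;       unsigned waysize = (aux >> 17) & 7;               /* way size field  */
;       unsigned lines   = 256u << waysize;               /* 32-byte lines per way */
;       for (unsigned idx = 0; idx < lines; idx++)        /* outer: index    */
;           for (unsigned way = 0; way < ways; way++)     /* inner: way      */
;           {
;               write32(base + PL310_REG7_CLEAN_INV_INDEX,
;                       (way << 28) | (idx << 5));
;               while (read32(base + PL310_REG7_CLEAN_INV_INDEX) & 1)
;                   continue;                             /* wait for line   */
;           }
;       /* followed by a PL310Sync and a clean & invalidate of the ARM caches */
;   }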
SUBS a4, a4, #1<<5 ; next index BGE %BT10 ; Sync PL310 PL310Sync a2, a1 ; Clean & invalidate ARM caches PullEnv B Cache_CleanInvalidateAll_WB_CR7_Lx Cache_CleanAll_PL310 ROUT Entry "a2" LDR a2, =ZeroPage LDR a2, [a2, #Cache_HALDevice] LDR a2, [a2, #HALDevice_Address] ; Clean ARM caches BL Cache_CleanAll_WB_CR7_Lx ; Clean PL310 LDR a1, [a2, #PL310_REG1_AUX_CONTROL] TST a1, #1<<16 MOV a1, #&FF ORRNE a1, a1, #&FF00 ; Mask of all ways STR a1, [a2, #PL310_REG7_CLEAN_WAY] 10 LDR a1, [a2, #PL310_REG7_CLEAN_WAY] TEQ a1, #0 BNE %BT10 ; Sync PL310 PL310Sync a2, a1 EXIT Cache_InvalidateAll_PL310 ROUT Entry "a2" LDR a2, =ZeroPage LDR a2, [a2, #Cache_HALDevice] LDR a2, [a2, #HALDevice_Address] ; Invalidate PL310 LDR a1, [a2, #PL310_REG1_AUX_CONTROL] TST a1, #1<<16 MOV a1, #&FF ORRNE a1, a1, #&FF00 ; Mask of all ways STR a1, [a2, #PL310_REG7_INV_WAY] 10 LDR a1, [a2, #PL310_REG7_INV_WAY] TEQ a1, #0 BNE %BT10 ; Sync PL310 PL310Sync a2, a1 ; Invalidate ARM caches PullEnv B Cache_InvalidateAll_WB_CR7_Lx Cache_RangeThreshold_PL310 ROUT MOV a1, #1024*1024 MOV pc, lr Cache_Examine_PL310 ROUT ; Assume that the PL310 is the level 2 cache CMP r1, #1 BLT Cache_Examine_WB_CR7_Lx MOVGT r0, #0 MOVGT r1, #0 MOVGT r2, #0 MOVGT r3, #0 MOVGT r4, #0 MOVGT pc, lr LDR r0, =ZeroPage LDR r0, [r0, #Cache_HALDevice] LDR r0, [r0, #HALDevice_Address] LDR r0, [r0, #PL310_REG1_AUX_CONTROL] AND r2, r0, #&E0000 ; Get way size TST r0, #1:SHL:16 ; Check associativity MOV r2, r2, LSR #17 MOVEQ r1, #8*1024*8 ; 8KB base way size with 8 way associativity MOVNE r1, #8*1024*16 ; 8KB base way size with 16 way associativity MOV r2, r1, LSL r2 ; Assume this really is a PL310 (32 byte line size, unified architecture) MOV r0, #4 MOV r1, #32 MOV r3, #32 MOV r4, r2 MOV pc, lr WriteBuffer_Drain_PL310 ROUT Entry LDR lr, =ZeroPage LDR lr, [lr, #Cache_HALDevice] LDR lr, [lr, #HALDevice_Address] ; Drain ARM write buffer myDSB ,a1 ; DSB is the new name for write buffer draining myISB ,a1,,y ; Also do ISB for extra paranoia ; Drain PL310 write buffer PL310Sync lr, a1 EXIT MMU_Changing_PL310 ROUT Entry myDSB ,a1 ; Ensure the page table write has actually completed myISB ,a1,,y ; Also required BL Cache_CleanInvalidateAll_PL310 MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB myDSB ,a1,,y ; Wait for TLB invalidation to complete myISB ,a1,,y ; Ensure that the effects are visible EXIT ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_PL310 ROUT Entry "a1-a4" ; MMU_ChangingEntry_WB_CR7_Lx performs a clean & invalidate before invalidating the TLBs. ; This means we must behave in a similar way to the PL310 clean & invalidate: ; * Clean ARM ; * Clean & invalidate PL310 ; * Clean & invalidate ARM (i.e. do the MMU changing op) ; Convert logical addr to physical. ; Use the ARMv7 CP15 registers for convenience. 
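; The code below can be read as the following illustrative C sketch (not
; part of the build). cp15_ats1cpr/cp15_read_par and read32/write32 are
; hypothetical helpers standing in for the MCR/MRC and MMIO accesses that
; follow; the ARM D-cache clean that precedes the PL310 pass is omitted.
;
;   /* Clean & invalidate one 4K page from the PL310 by physical address. */
;   void pl310_clean_inv_page(unsigned base, unsigned va)
;   {
;       cp15_ats1cpr(va);                         /* ATS1CPR: translate VA   */
;       unsigned par = cp15_read_par();           /* read translation result */
;       if (par & 1)
;           return;                               /* no mapping: nothing cached */
;       unsigned pa = par & ~0xFFFu;              /* strip attribute bits    */
;       for (unsigned off = 0; off < 4096; off += 32)
;       {
;           write32(base + PL310_REG7_CLEAN_INV_PA, pa + off);
;           while (read32(base + PL310_REG7_CLEAN_INV_PA) & 1)
;               continue;                         /* wait for this line      */
;       }
;       /* then PL310Sync, then the normal MMU_ChangingEntry_WB_CR7_Lx op   */
;   }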
PHPSEI MCR p15, 0, a1, c7, c8, 0 ; ATS1CPR myISB ,a4 MRC p15, 0, a4, c7, c4, 0 ; Get result PLP TST a4, #1 BNE %FT50 ; Lookup failed - assume this means that the page doesn't need cleaning from the PL310 ; Mask out the memory attributes that were returned by the lookup [ NoARMT2 BIC a4, a4, #&FF BIC a4, a4, #&F00 | BFC a4, #0, #12 ] ; Clean ARM myDSB ,lr myISB ,lr,,y LDR lr, =ZeroPage ADD a2, a1, #PageSize ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] ; log2(line len)-2 MOV lr, #4 MOV a3, lr, LSL a3 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry to PoC ADD a1, a1, a3 CMP a1, a2 BNE %BT10 myDSB ,a3 ; Wait for clean to complete ; Clean & invalidate PL310 LDR a2, =ZeroPage LDR a2, [a2, #Cache_HALDevice] LDR a2, [a2, #HALDevice_Address] ; Ensure we haven't re-entered an in-progress op 20 LDR lr, [a2, #PL310_REG7_CLEAN_INV_PA] TST lr, #1 BNE %BT20 ; Clean & invalidate each line/index of the page ADD a1, a4, #&FE0 ; last line within the page 30 STR a4, [a2, #PL310_REG7_CLEAN_INV_PA] 40 LDR lr, [a2, #PL310_REG7_CLEAN_INV_PA] TST lr, #1 BNE %BT40 TEQ a4, a1 ADD a4, a4, #1<<5 ; next index BNE %BT30 ; Sync PL310Sync a2, a1 50 ; Clean & invalidate ARM (+ do MMU op) PullEnv B MMU_ChangingEntry_WB_CR7_Lx ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_PL310 ROUT Entry "a2-a3" ; Keep this one simple and just split it into a series of per-page operations ; This will result in some unnecessary TLB invalidate & PL310 sync thrashing, so in the future a more advanced implementation might be nice. CMP a2, #1024*1024/PageSize ; Arbitrary threshold for full clean BHS %FT20 MOV a3, a1 10 MOV a1, a3 BL MMU_ChangingEntry_PL310 SUBS a2, a2, #1 ADD a3, a3, #PageSize BNE %BT10 EXIT 20 ; Full clean required BL Cache_CleanInvalidateAll_PL310 MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB myDSB ,a1,,y ; Wait TLB invalidation to complete myISB ,a1,,y ; Ensure that the effects are visible EXIT ] ; MEMM_Type = "VMSAv6" ; -------------------------------------------------------------------------- LookForHALCacheController ROUT Entry "r0-r3,r8,r12" ; Look for any known cache controllers that the HAL has registered, and ; replace our ARMop routines with the appropriate routines for that ; controller LDR r0, =(0:SHL:16)+HALDeviceType_SysPeri+HALDeviceSysPeri_CacheC MOV r1, #0 LDR r12, =ZeroPage STR r1, [r12, #Cache_HALDevice] ; In case none found 10 MOV r8, #OSHW_DeviceEnumerate SWI XOS_Hardware EXIT VS CMP r1, #-1 EXIT EQ ; Do we recognise this controller? ASSERT HALDevice_ID = 2 [ NoARMv4 LDR lr, [r2] MOV lr, lr, LSR #16 | LDRH lr, [r2, #HALDevice_ID] ] ADR r8, KnownHALCaches 20 LDR r12, [r8], #8+Proc_MMU_ChangingUncachedEntries-Proc_Cache_CleanInvalidateAll CMP r12, #-1 BEQ %BT10 CMP lr, r12 BNE %BT20 ; Cache recognised. Disable IRQs for safety, and then try enabling it. Push "r2" MOV r0, r2 MSR CPSR_c, #SVC32_mode+I32_bit MOV lr, pc LDR pc, [r2, #HALDevice_Activate] CMP r0, #1 Pull "r2" MSRNE CPSR_c, #SVC32_mode BNE %BT10 ; Cache enabled OK - remember the device pointer and patch our maintenance ops LDR r0, =ZeroPage STR r2, [r0, #Cache_HALDevice] ADD r0, r0, #Proc_Cache_CleanInvalidateAll MOV r1, #Proc_MMU_ChangingUncachedEntries-Proc_Cache_CleanInvalidateAll 30 LDR r3, [r8, #-4]! 
TEQ r3, #0 STRNE r3, [r0, r1] SUBS r1, r1, #4 BGE %BT30 ; It's now safe to restore IRQs MSR CPSR_c, #SVC32_mode EXIT KnownHALCaches ROUT [ MEMM_Type = "VMSAv6" DCD HALDeviceID_CacheC_PL310 01 DCD Cache_CleanInvalidateAll_PL310 DCD Cache_CleanAll_PL310 DCD Cache_InvalidateAll_PL310 DCD Cache_RangeThreshold_PL310 DCD Cache_Examine_PL310 DCD 0 ; TLB_InvalidateAll DCD 0 ; TLB_InvalidateEntry DCD WriteBuffer_Drain_PL310 DCD 0 ; IMB_Full DCD 0 ; IMB_Range DCD 0 ; IMB_List DCD MMU_Changing_PL310 DCD MMU_ChangingEntry_PL310 DCD 0 ; MMU_ChangingUncached DCD 0 ; MMU_ChangingUncachedEntry DCD MMU_ChangingEntries_PL310 DCD 0 ; MMU_ChangingUncachedEntries ASSERT . - %BT01 = 4+Proc_MMU_ChangingUncachedEntries-Proc_Cache_CleanInvalidateAll ] DCD -1 ; -------------------------------------------------------------------------- ; ARMops exposed by OS_MMUControl 2 ARMopPtrTable DCD ZeroPage + Proc_Cache_CleanInvalidateAll DCD ZeroPage + Proc_Cache_CleanAll DCD ZeroPage + Proc_Cache_InvalidateAll DCD ZeroPage + Proc_Cache_RangeThreshold DCD ZeroPage + Proc_TLB_InvalidateAll DCD ZeroPage + Proc_TLB_InvalidateEntry DCD ZeroPage + Proc_WriteBuffer_Drain DCD ZeroPage + Proc_IMB_Full DCD ZeroPage + Proc_IMB_Range DCD ZeroPage + Proc_IMB_List DCD ZeroPage + Proc_MMU_Changing DCD ZeroPage + Proc_MMU_ChangingEntry DCD ZeroPage + Proc_MMU_ChangingUncached DCD ZeroPage + Proc_MMU_ChangingUncachedEntry DCD ZeroPage + Proc_MMU_ChangingEntries DCD ZeroPage + Proc_MMU_ChangingUncachedEntries ARMopPtrTable_End ; IMPORT Write0_Translated ARM_PrintProcessorType LDR a1, =ZeroPage LDRB a1, [a1, #ProcessorType] TEQ a1, #ARMunk MOVEQ pc, lr Push "lr" ADR a2, PNameTable LDHA a1, a2, a1, a3 ADD a1, a2, a1 [ International BL Write0_Translated | SWI XOS_Write0 ] SWI XOS_NewLine SWI XOS_NewLine Pull "pc" PNameTable DCW PName_ARM600 - PNameTable DCW PName_ARM610 - PNameTable DCW PName_ARM700 - PNameTable DCW PName_ARM710 - PNameTable DCW PName_ARM710a - PNameTable DCW PName_SA110 - PNameTable ; pre rev T DCW PName_SA110 - PNameTable ; rev T or later DCW PName_ARM7500 - PNameTable DCW PName_ARM7500FE - PNameTable DCW PName_SA1100 - PNameTable DCW PName_SA1110 - PNameTable DCW PName_ARM720T - PNameTable DCW PName_ARM920T - PNameTable DCW PName_ARM922T - PNameTable DCW PName_X80200 - PNameTable DCW PName_X80321 - PNameTable DCW PName_ARM1176JZF_S - PNameTable DCW PName_Cortex_A5 - PNameTable DCW PName_Cortex_A7 - PNameTable DCW PName_Cortex_A8 - PNameTable DCW PName_Cortex_A9 - PNameTable DCW PName_Cortex_A17 - PNameTable ; A12 rebranded as A17 DCW PName_Cortex_A15 - PNameTable DCW PName_Cortex_A17 - PNameTable PName_ARM600 = "600:ARM 600 Processor",0 PName_ARM610 = "610:ARM 610 Processor",0 PName_ARM700 = "700:ARM 700 Processor",0 PName_ARM710 = "710:ARM 710 Processor",0 PName_ARM710a = "710a:ARM 710a Processor",0 PName_SA110 = "SA110:SA-110 Processor",0 PName_ARM7500 = "7500:ARM 7500 Processor",0 PName_ARM7500FE = "7500FE:ARM 7500FE Processor",0 PName_SA1100 = "SA1100:SA-1100 Processor",0 PName_SA1110 = "SA1110:SA-1110 Processor",0 PName_ARM720T = "720T:ARM 720T Processor",0 PName_ARM920T = "920T:ARM 920T Processor",0 PName_ARM922T = "922T:ARM 922T Processor",0 PName_X80200 = "X80200:80200 Processor",0 PName_X80321 = "X80321:80321 Processor",0 PName_ARM1176JZF_S = "ARM1176JZF_S:ARM1176JZF-S Processor",0 PName_Cortex_A5 = "CA5:Cortex-A5 Processor",0 PName_Cortex_A7 = "CA7:Cortex-A7 Processor",0 PName_Cortex_A8 = "CA8:Cortex-A8 Processor",0 PName_Cortex_A9 = "CA9:Cortex-A9 Processor",0 PName_Cortex_A15 = "CA15:Cortex-A15 Processor",0 
PName_Cortex_A17 = "CA17:Cortex-A17 Processor",0 ALIGN ; Lookup tables from DA flags PCB (bits 14:12,5,4, packed down to 4:2,1,0) ; to XCB bits in page table descriptors. XCB_CB * 0:SHL:0 XCB_NB * 1:SHL:0 XCB_NC * 1:SHL:1 XCB_P * 1:SHL:2 [ MEMM_Type = "VMSAv6" XCB_TU * 1:SHL:5 ; For VMSAv6, deal with temp uncacheable via the table ] ALIGN 32 [ MEMM_Type = "ARM600" ; WT read-allocate cache (eg ARM720T) XCBTableWT ; C+B CNB NCB NCNB = L2_C+L2_B, L2_C, L2_B, 0 ; Default = L2_C+L2_B, L2_C, L2_B, 0 ; WT, WT, Non-merging, X = L2_C+L2_B, L2_C, L2_B, 0 ; WB/RA, WB, Merging, X = L2_C+L2_B, L2_C, L2_B, 0 ; WB/WA, X, Idempotent, X = L2_C+L2_B, L2_C, L2_B, 0 ; Alt DCache, X, X, X = L2_C+L2_B, L2_C, L2_B, 0 ; X, X, X, X = L2_C+L2_B, L2_C, L2_B, 0 ; X, X, X, X = L2_C+L2_B, L2_C, L2_B, 0 ; X, X, X, X ; SA-110 in Risc PC - WB only read-allocate cache, non-merging WB XCBTableSA110 ; C+B CNB NCB NCNB = L2_C+L2_B, 0, L2_B, 0 ; Default = L2_B, 0, L2_B, 0 ; WT, WT, Non-merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/RA, WB, Merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/WA, X, Idempotent, X = L2_C+L2_B, 0, L2_B, 0 ; Alt DCache, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X ; ARMv5 WB/WT read-allocate cache, non-merging WB (eg ARM920T) XCBTableWBR ; C+B CNB NCB NCNB = L2_C+L2_B, 0, L2_B, 0 ; Default = L2_C , 0, L2_B, 0 ; WT, WT, Non-merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/RA, WB, Merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/WA, X, Idempotent, X = L2_C+L2_B, 0, L2_B, 0 ; Alt DCache, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X ; SA-1110 - WB only read allocate cache, merging WB, mini D-cache XCBTableSA1110 ; C+B CNB NCB NCNB = L2_C+L2_B, 0, L2_B, 0 ; Default = L2_B, 0, 0, 0 ; WT, WT, Non-merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/RA, WB, Merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/WA, X, Idempotent, X = L2_C , 0, L2_B, 0 ; Alt DCache, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X ; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache ; defaulting to read-allocate XCBTableXScaleRA ; C+B CNB NCB NCNB = L2_C+L2_B, 0, L2_B, 0 ; Default = L2_C , 0, L2_X+L2_B, 0 ; WT, WT, Non-merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/RA, WB, Merging, X = L2_X+L2_C+L2_B, 0, L2_B, 0 ; WB/WA, X, Idempotent, X = L2_X+L2_C , 0, L2_B, 0 ; Alt DCache, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X ; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache ; defaulting to write-allocate XCBTableXScaleWA ; C+B CNB NCB NCNB = L2_X+L2_C+L2_B, 0, L2_B, 0 ; Default = L2_C , 0, L2_X+L2_B, 0 ; WT, WT, Non-merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/RA, WB, Merging, X = L2_X+L2_C+L2_B, 0, L2_B, 0 ; WB/WA, X, Idempotent, X = L2_X+L2_C , 0, L2_B, 0 ; Alt DCache, X, X, X = L2_X+L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_X+L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_X+L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X ; XScale - WB/WT read-allocate cache, merging WB, no mini D-cache/extended pages XCBTableXScaleNoExt ; C+B CNB NCB NCNB = L2_C+L2_B, 0, L2_B, 0 ; Default = L2_C , 0, 0, 0 ; WT, WT, Non-merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/RA, WB, Merging, X = L2_C+L2_B, 0, L2_B, 0 ; WB/WA, X, Idempotent, X = L2_C+L2_B, 0, L2_B, 0 ; Alt DCache, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X = L2_C+L2_B, 0, L2_B, 0 ; X, X, X, X ] ; MEMM_Type = "ARM600" [ 
MEMM_Type = "VMSAv6" ; VMSAv6/v7 L2 memory attributes (short descriptor format, TEX remap disabled, identical inner/outer attributes) L2_SO_S * 0 ; Strongly-ordered, shareable L2_Dev_S * L2_B ; Device, shareable L2_Nrm_WT * L2_C ; Normal, WT/RA, S bit determines shareability L2_Nrm_WBRA * L2_C+L2_B ; Normal, WB/RA, S bit determines shareability L2_Nrm_NC * 1:SHL:L2_TEXShift ; Normal, non-cacheable (but bufferable), S bit determines shareability L2_Nrm_WBWA * (1:SHL:L2_TEXShift)+L2_C+L2_B ; Normal, WB/WA, S bit determines shareability L2_Dev_nS * 2:SHL:L2_TEXShift ; Device, non-shareable ; Generic XCB table for VMSAv6/v7 ; * NCNB is roughly equivalent to "strongly ordered". ; * NCB with non-merging write buffer is equivalent to "Device". ; * NCB with merging write buffer is also mapped to "Device". "Normal" is ; tempting but may result in issues with read-sensitive devices (see below). ; * For NCB with devices which aren't read-sensitive, we introduce a new ; "Merging write buffer with idempotent memory" policy which maps to the ; Normal, non-cacheable type. This will degrade nicely on older OS's and CPUs, ; avoiding some isses if we were to make NCB with merging write buffer default ; to Normal memory. This policy is also the new default, so that all existing ; NCB RAM uses it (so unaligned loads, etc. will work). No existing code seems ; to be using NCB for IO devices (only for IO RAM like VRAM), so this change ; should be safe (previously, all NCB policies would have mapped to Device ; memory) ; * CNB has no equivalent - there's no control over whether the write buffer is ; used for cacheable regions, so we have to downgrade to NCNB. ; The caches should behave sensibly when given unsupported attributes ; (downgrade WB to WT to NC), but we may end up doing more cache maintenance ; than needed if the hardware downgrades some areas to NC. XCBTableVMSAv6 ; C+B CNB NCB NCNB = L2_Nrm_WBRA, L2_SO_S, L2_Dev_S, L2_SO_S ; Default = L2_Nrm_WT, L2_SO_S, L2_Dev_S, L2_SO_S ; WT, WT, Non-merging, X = L2_Nrm_WBRA, L2_SO_S, L2_Dev_S, L2_SO_S ; WB/RA, WB, Merging, X = L2_Nrm_WBWA, L2_SO_S, L2_Nrm_NC, L2_SO_S ; WB/WA, X, Idempotent, X = L2_Nrm_WBRA, L2_SO_S, L2_Nrm_NC, L2_SO_S ; Alt DCache, X, X, X = L2_Nrm_WBRA, L2_SO_S, L2_Nrm_NC, L2_SO_S ; X, X, X, X = L2_Nrm_WBRA, L2_SO_S, L2_Nrm_NC, L2_SO_S ; X, X, X, X = L2_Nrm_WBRA, L2_SO_S, L2_Nrm_NC, L2_SO_S ; X, X, X, X ; This second set of entries deals with when pages are made ; temporarily uncacheable - we need to change the cacheability without ; changing the memory type. = L2_Nrm_NC, L2_SO_S, L2_Dev_S, L2_SO_S ; Default = L2_Nrm_NC, L2_SO_S, L2_Dev_S, L2_SO_S ; WT, WT, Non-merging, X = L2_Nrm_NC, L2_SO_S, L2_Dev_S, L2_SO_S ; WB/RA, WB, Merging, X = L2_Nrm_NC, L2_SO_S, L2_Nrm_NC, L2_SO_S ; WB/WA, X, Idempotent, X = L2_Nrm_NC, L2_SO_S, L2_Nrm_NC, L2_SO_S ; Alt DCache, X, X, X = L2_Nrm_NC, L2_SO_S, L2_Nrm_NC, L2_SO_S ; X, X, X, X = L2_Nrm_NC, L2_SO_S, L2_Nrm_NC, L2_SO_S ; X, X, X, X = L2_Nrm_NC, L2_SO_S, L2_Nrm_NC, L2_SO_S ; X, X, X, X ] ; MEMM_Type = "VMSAv6" END
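; For reference, the XCB tables in this file are indexed by the page's DA
; flags PCB bits, packed as described above ("bits 14:12,5,4 packed down to
; 4:2,1,0"). The following illustrative C sketch shows that lookup;
; xcb_lookup is a hypothetical helper, not a routine in this kernel. Only
; the VMSAv6 table has the second 32-byte half selected by XCB_TU; the
; ARM600-era tables consist of just the first 32 entries.
;
;   unsigned char xcb_lookup(const unsigned char table[64],
;                            unsigned da_flags, int temp_uncacheable)
;   {
;       unsigned idx = (((da_flags >> 12) & 7) << 2)  /* policy: bits 14:12 -> 4:2 */
;                    | ((da_flags >> 4) & 3);         /* NC,NB:  bits 5,4   -> 1,0 */
;       if (temp_uncacheable)
;           idx |= 32;                                /* XCB_TU: temp uncacheable half */
;       return table[idx];                            /* L2_* bits for the descriptor  */
;   }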