; Copyright 2000 Pace Micro Technology plc ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; See the License for the specific language governing permissions and ; limitations under the License. ; ; GET Hdr:ListOpts ; GET Hdr:Macros ; GET Hdr:System ; $GetCPU ; $GetMEMM ; GET hdr.Options ; GET Hdr:PublicWS ; GET Hdr:KernelWS ; GET hdr.Copro15ops ; GET hdr.ARMops v7 RN 10 ; EXPORT Init_ARMarch ; EXPORT ARM_Analyse ; EXPORT ARM_PrintProcessorType ; AREA KernelCode,CODE,READONLY ; ARM keep changing their mind about ID field layout. ; Here's a summary, courtesy of the ARM ARM (v5): ; ; pre-ARM 7: xxxx0xxx ; ARM 7: xxxx7xxx where bit 23 indicates v4T/~v3 ; post-ARM 7: xxxanxxx where n<>0 or 7 and a = architecture (1=4,2=4T,3=5,4=5T) ; ; int Init_ARMarch(void) ; Returns architecture, as above in a1. Also EQ if ARMv3, NE if ARMv4 or later. ; Corrupts only ip, no RAM usage. Init_ARMarch ARM_read_ID ip ANDS a1, ip, #&0000F000 MOVEQ pc, lr ; ARM 3 or ARM 6 TEQ a1, #&00007000 BNE %FT20 TST ip, #&00800000 ; ARM 7 - check for Thumb MOVNE a1, #ARMv4T MOVEQ a1, #ARMv3 MOV pc, lr 20 ANDS a1, ip, #&000F0000 ; post-ARM 7 MOV a1, a1, LSR #16 MOV pc, lr ARM_Analyse MOV a2, lr BL Init_ARMarch MOV lr, a2 [ MEMM_Type = "VMSAv6" CMP a1, #ARMvF BEQ ARM_Analyse_Fancy ; New ARM; use the feature regs to perform all the setup ] Push "v1,v2,v5,v6,v7,lr" ARM_read_ID v1 ARM_read_cachetype v2 LDR v6, =ZeroPage ADRL v7, KnownCPUTable FindARMloop LDMIA v7!, {a1, a2} ; See if it's a known ARM CMP a1, #-1 BEQ %FT20 AND a2, v1, a2 TEQ a1, a2 ADDNE v7, v7, #8 BNE FindARMloop TEQ v2, v1 ; If we don't have cache attributes, read from table LDREQ v2, [v7] 20 TEQ v2, v1 BEQ %BT20 ; Cache unknown: panic CMP a1, #-1 LDRNEB a2, [v7, #4] MOVEQ a2, #ARMunk STRB a2, [v6, #ProcessorType] ASSERT CT_Isize_pos = 0 MOV a1, v2 ADD a2, v6, #ICache_Info BL EvaluateCache MOV a1, v2, LSR #CT_Dsize_pos ADD a2, v6, #DCache_Info BL EvaluateCache AND a1, v2, #CT_ctype_mask MOV a1, a1, LSR #CT_ctype_pos STRB a1, [v6, #Cache_Type] [ No26bitCode MOV v5, #CPUFlag_32bitOS | MOV v5, #0 ] [ HiProcVecs ORR v5, v5, #CPUFlag_HiProcVecs ] TST v2, #CT_S ORRNE v5, v5, #CPUFlag_SplitCache+CPUFlag_SynchroniseCodeAreas [ CacheOff ORR v5, v5, #CPUFlag_SynchroniseCodeAreas | ARM_read_control a1 ; if Z bit set then we have branch prediction, TST a1, #MMUC_Z ; so we need OS_SynchroniseCodeAreas even if not ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas ; split caches ] ; Test abort timing (base restored or base updated) MOV a1, #&8000 LDR a2, [a1], #4 ; Will abort - DAb handler will continue execution TEQ a1, #&8000 ORREQ v5, v5, #CPUFlag_BaseRestored ; Check store of PC 30 STR pc, [sp, #-4]! ADR a2, %BT30 + 8 LDR a1, [sp], #4 TEQ a1, a2 ORREQ v5, v5, #CPUFlag_StorePCplus8 [ 0=1 ; Check whether 26-bit mode is available MSR CPSR_c, #F32_bit+I32_bit+SVC26_mode MRS a1, CPSR AND a1, a1, #M32_bits TEQ a1, #SVC26_mode ORRNE v5, v5, #CPUFlag_No26bitMode MSREQ CPSR_c, #F32_bit+I32_bit+SVC32_mode BNE %FT35 ; Do we get vector exceptions on read? 
LDR a2, =ZeroPage MOV a1, a2 LDR a1, [a1] ; If this aborts a1 will be left unchanged TEQ a1, a2 ORREQ v5, v5, #CPUFlag_VectorReadException ] 35 BL Init_ARMarch STRB a1, [v6, #ProcessorArch] TEQ a1, #ARMv3 ; assume long multiply available ORRNE v5, v5, #CPUFlag_LongMul ; if v4 or later TEQNE a1, #ARMv4 ; assume 26-bit available ORRNE v5, v5, #CPUFlag_No26bitMode ; iff v3 or v4 (not T) TEQNE a1, #ARMv5 ; assume Thumb available ORRNE v5, v5, #CPUFlag_Thumb ; iff not v3,v4,v5 MSR CPSR_f, #Q32_bit MRS lr, CPSR TST lr, #Q32_bit ORRNE v5, v5, #CPUFlag_DSP LDRB v4, [v6, #ProcessorType] TEQ v4, #ARMunk ; Modify deduced flags ADRNEL lr, KnownCPUFlags ADDNE lr, lr, v4, LSL #3 LDMNEIA lr, {a2, a3} ORRNE v5, v5, a2 BICNE v5, v5, a3 [ XScaleJTAGDebug TST v5, #CPUFlag_XScale BEQ %FT40 MRC p14, 0, a2, c10, c0 ; Read debug control register TST a2, #&80000000 ORRNE v5, v5, #CPUFlag_XScaleJTAGconnected MOVEQ a2, #&C000001C ; enable hot debug MCREQ p14, 0, a2, c10, c0 BNE %FT40 40 ] STR v5, [v6, #ProcessorFlags] ; Now, a1 = processor architecture (ARMv3, ARMv4 ...) ; v4 = processor type (ARM600, ARM610, ...) ; v5 = processor flags CMP a1, #ARMv4 BLO Analyse_ARMv3 ; eg. ARM710 LDRB a2, [v6, #Cache_Type] TEQ a2, #CT_ctype_WT TSTEQ v5, #CPUFlag_SplitCache BEQ Analyse_WriteThroughUnified ; eg. ARM7TDMI derivative TEQ a2, #CT_ctype_WB_CR7_LDa BEQ Analyse_WB_CR7_LDa ; eg. ARM9 TEQ a2, #CT_ctype_WB_Crd BEQ Analyse_WB_Crd ; eg. StrongARM TEQ a2, #CT_ctype_WB_Cal_LD BEQ Analyse_WB_Cal_LD ; assume XScale ; others ... WeirdARMPanic B WeirdARMPanic ; stiff :) Analyse_ARMv3 ADRL a1, NullOp ADRL a2, Cache_Invalidate_ARMv3 ADRL a3, WriteBuffer_Drain_ARMv3 ADRL a4, TLB_Invalidate_ARMv3 ADRL ip, TLB_InvalidateEntry_ARMv3 STR a1, [v6, #Proc_Cache_CleanAll] STR a2, [v6, #Proc_Cache_CleanInvalidateAll] STR a2, [v6, #Proc_Cache_InvalidateAll] STR a3, [v6, #Proc_WriteBuffer_Drain] STR a4, [v6, #Proc_TLB_InvalidateAll] STR ip, [v6, #Proc_TLB_InvalidateEntry] STR a1, [v6, #Proc_IMB_Full] STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_ARMv3 ADRL a2, MMU_ChangingEntry_ARMv3 ADRL a3, MMU_ChangingUncached_ARMv3 ADRL a4, MMU_ChangingUncachedEntry_ARMv3 STR a1, [v6, #Proc_MMU_Changing] STR a2, [v6, #Proc_MMU_ChangingEntry] STR a3, [v6, #Proc_MMU_ChangingUncached] STR a4, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_ARMv3 ADRL a2, MMU_ChangingUncachedEntries_ARMv3 ADRL a3, Cache_RangeThreshold_ARMv3 STR a1, [v6, #Proc_MMU_ChangingEntries] STR a2, [v6, #Proc_MMU_ChangingUncachedEntries] STR a3, [v6, #Proc_Cache_RangeThreshold] ADRL a1, XCBTableWT STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WriteThroughUnified ADRL a1, NullOp ADRL a2, Cache_InvalidateUnified TST v5, #CPUFlag_NoWBDrain ADRNEL a3, WriteBuffer_Drain_OffOn ADREQL a3, WriteBuffer_Drain ADRL a4, TLB_Invalidate_Unified ADRL ip, TLB_InvalidateEntry_Unified STR a1, [v6, #Proc_Cache_CleanAll] STR a2, [v6, #Proc_Cache_CleanInvalidateAll] STR a2, [v6, #Proc_Cache_InvalidateAll] STR a3, [v6, #Proc_WriteBuffer_Drain] STR a4, [v6, #Proc_TLB_InvalidateAll] STR ip, [v6, #Proc_TLB_InvalidateEntry] STR a1, [v6, #Proc_IMB_Full] STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_Writethrough ADRL a2, MMU_ChangingEntry_Writethrough ADRL a3, MMU_ChangingUncached ADRL a4, MMU_ChangingUncachedEntry STR a1, [v6, #Proc_MMU_Changing] STR a2, [v6, #Proc_MMU_ChangingEntry] STR a3, [v6, #Proc_MMU_ChangingUncached] STR a4, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_Writethrough ADRL a2, MMU_ChangingUncachedEntries ADRL a3, 
Cache_RangeThreshold_Writethrough STR a1, [v6, #Proc_MMU_ChangingEntries] STR a2, [v6, #Proc_MMU_ChangingUncachedEntries] STR a3, [v6, #Proc_Cache_RangeThreshold] ADRL a1, XCBTableWT STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WB_CR7_LDa TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard caches here (eg. ARM920) ADRL a1, Cache_CleanInvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_CR7_LDa STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_CR7_LDa STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_CR7_LDa STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_CR7_LDa STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_CR7_LDa STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_CR7_LDa STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] LDRB a2, [v6, #DCache_Associativity] MOV a3, #256 MOV a4, #8 ; to find log2(ASSOC), rounded up Analyse_WB_CR7_LDa_L1 MOV a3, a3, LSR #1 SUB a4, a4, #1 CMP a2, a3 BLO Analyse_WB_CR7_LDa_L1 ADDHI a4, a4, #1 RSB a2, a4, #32 MOV a3, #1 MOV a3, a3, LSL a2 STR a3, [v6, #DCache_IndexBit] LDR a4, [v6, #DCache_NSets] LDRB a2, [v6, #DCache_LineLen] SUB a4, a4, #1 MUL a4, a2, a4 STR a4, [v6, #DCache_IndexSegStart] MOV a2, #64*1024 ; arbitrary-ish STR a2, [v6, #DCache_RangeThreshold] ADRL a1, XCBTableWBR ; assume read-allocate WB/WT cache STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WB_Crd TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard ADRL a1, Cache_CleanInvalidateAll_WB_Crd STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_Crd STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_Crd STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_Crd STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_Crd STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_Crd STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_Crd STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_Crd STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_Crd STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_Crd STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_Crd STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_Crd STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] LDR a2, =DCacheCleanAddress STR a2, [v6, #DCache_CleanBaseAddress] STR a2, [v6, #DCache_CleanNextAddress] MOV a2, #64*1024 ;arbitrary-ish threshold STR a2, [v6, #DCache_RangeThreshold] LDRB a2, [v6, #ProcessorType] TEQ a2, #SA110 ADREQL a2, XCBTableSA110 BEQ 
Analyse_WB_Crd_finish TEQ a2, #SA1100 TEQNE a2, #SA1110 ADREQL a2, XCBTableSA1110 ADRNEL a2, XCBTableWBR Analyse_WB_Crd_finish STR a2, [v6, #MMU_PCBTrans] B %FT90 Analyse_WB_Cal_LD TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard ADRL a1, Cache_CleanInvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_Cal_LD STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_Cal_LD STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_Cal_LD STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_Cal_LD STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_Cal_LD STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_Cal_LD STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] LDR a2, =DCacheCleanAddress STR a2, [v6, #DCache_CleanBaseAddress] STR a2, [v6, #DCache_CleanNextAddress] [ XScaleMiniCache ! 1, "You need to arrange for XScale mini-cache clean area to be mini-cacheable" LDR a2, =DCacheCleanAddress + 4 * 32*1024 STR a2, [v6, #MCache_CleanBaseAddress] STR a2, [v6, #MCache_CleanNextAddress] ] ; arbitrary-ish values, mini cache makes global op significantly more expensive [ XScaleMiniCache MOV a2, #128*1024 | MOV a2, #32*1024 ] STR a2, [v6, #DCache_RangeThreshold] ; enable full coprocessor access LDR a2, =&3FFF MCR p15, 0, a2, c15, c1 LDR a2, [v6, #ProcessorFlags] TST a2, #CPUFlag_ExtendedPages ADREQL a2, XCBTableXScaleNoExt ADRNEL a2, XCBTableXScaleWA ; choose between RA and WA here STR a2, [v6, #MMU_PCBTrans] B %FT90 [ MEMM_Type = "VMSAv6" Analyse_WB_CR7_Lx TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard caches here ; Read the cache info into Cache_Lx_* MRC p15, 1, a1, c0, c0, 1 ; Cache level ID register MOV v2, v6 ; Work around DTable/ITable alignment issues STR a1, [v2, #Cache_Lx_Info]! ADD a2, v2, #Cache_Lx_DTable-Cache_Lx_Info MOV a3, #0 MOV a4, #256 ; Smallest instruction cache line length MOV v2, #256 ; Smallest data/unified cache line length (although atm we only need this to be the smallest data cache line length) 10 ANDS v1, a1, #6 ; Data or unified cache at this level? MCRNE p15, 2, a3, c0, c0, 0 ; Program cache size selection register myISB ,v1 MRCNE p15, 1, v1, c0, c0, 0 ; Get size info (data/unified) STR v1, [a2] AND v1, v1, #7 ; Get line size CMP v1, v2 MOVLT v2, v1 ADD a3, a3, #1 ANDS v1, a1, #1 ; Instruction cache at this level? 
MCRNE p15, 2, a3, c0, c0, 0 ; Program cache size selection register myISB ,v1 MRCNE p15, 1, v1, c0, c0, 0 ; Get size info (instruction) STR v1, [a2, #Cache_Lx_ITable-Cache_Lx_DTable] AND v1, v1, #7 ; Get line size CMP v1, a4 MOVLT a4, v1 ; Shift the cache level ID register along to get the type of the next ; cache level ; However, we need to stop once we reach the first blank entry, because ; ARM have been sneaky and started to reuse some of the bits from the ; high end of the register (the Cortex-A8 TRM lists bits 21-23 as being ; for cache level 8, but the ARMv7 ARM lists them as being for the level ; of unification for inner shareable memory). The ARMv7 ARM does warn ; about making sure you stop once you find the first blank entry, but ; it doesn't say why! TST a1, #7 ADD a3, a3, #1 MOVNE a1, a1, LSR #3 CMP a3, #14 ; Stop after level 7 (even though an 8th level might exist on some CPUs?) ADD a2, a2, #4 BLT %BT10 STRB a4, [v6, #ICache_LineLen] ; Store log2(line size)-2 STRB v2, [v6, #DCache_LineLen] ; log2(line size)-2 ; Calculate DCache_RangeThreshold MOV a1, #128*1024 ; Arbitrary-ish STR a1, [v6, #DCache_RangeThreshold] ADRL a1, Cache_CleanInvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_CR7_Lx STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_CR7_Lx STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_CR7_Lx STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_CR7_Lx STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_CR7_Lx STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_CR7_Lx STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] ADRL a1, XCBTableWBR ; assume read-allocate WB/WT cache STR a1, [v6, #MMU_PCBTrans] B %FT90 ] ; MEMM_Type = "VMSAv6" 90 Pull "v1,v2,v5,v6,v7,pc" ; This routine works out the values LINELEN, ASSOCIATIVITY, NSETS and CACHE_SIZE defined ; in section B2.3.3 of the ARMv5 ARM. 
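;
; For illustration only (the routine below derives these values itself): with
; M=0, a 16K, 32-way, 32 byte line cache - eg. the SA-110 DCache - is encoded
; as len=2, assoc=5, size=5, giving
;   LINELEN       = 1 << (len+3)            = 32 bytes
;   Multiplier    = 2 + M                   = 2
;   ASSOCIATIVITY = Multiplier << (assoc-1) = 32 ways
;   CACHE_SIZE    = Multiplier << (size+8)  = 16384 bytes
;   NSETS         = 1 << (size+6-assoc-len) = 16 sets (16*32*32 = 16K)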
EvaluateCache AND a3, a1, #CT_assoc_mask+CT_M TEQ a3, #(CT_assoc_0:SHL:CT_assoc_pos)+CT_M BEQ %FT80 MOV ip, #1 ASSERT CT_len_pos = 0 AND a4, a1, #CT_len_mask ADD a4, a4, #3 MOV a4, ip, LSL a4 ; LineLen = 1 << (len+3) STRB a4, [a2, #ICache_LineLen-ICache_Info] MOV a3, #2 TST a1, #CT_M ADDNE a3, a3, #1 ; Multiplier = 2 + M AND a4, a1, #CT_assoc_mask RSB a4, ip, a4, LSR #CT_assoc_pos MOV a4, a3, LSL a4 ; Associativity = Multiplier << (assoc-1) STRB a4, [a2, #ICache_Associativity-ICache_Info] AND a4, a1, #CT_size_mask MOV a4, a4, LSR #CT_size_pos MOV a3, a3, LSL a4 MOV a3, a3, LSL #8 ; Size = Multiplier << (size+8) STR a3, [a2, #ICache_Size-ICache_Info] ADD a4, a4, #6 AND a3, a1, #CT_assoc_mask SUB a4, a4, a3, LSR #CT_assoc_pos AND a3, a1, #CT_len_mask ASSERT CT_len_pos = 0 SUB a4, a4, a3 MOV a4, ip, LSL a4 ; NSets = 1 << (size + 6 - assoc - len) STR a4, [a2, #ICache_NSets-ICache_Info] MOV pc, lr 80 MOV a1, #0 STR a1, [a2, #ICache_NSets-ICache_Info] STR a1, [a2, #ICache_Size-ICache_Info] STRB a1, [a2, #ICache_LineLen-ICache_Info] STRB a1, [a2, #ICache_Associativity-ICache_Info] MOV pc, lr ; Create a list of CPUs, 16 bytes per entry: ; ID bits (1 word) ; Test mask for ID (1 word) ; Cache type register value (1 word) ; Processor type (1 byte) ; Architecture type (1 byte) ; Reserved (2 bytes) GBLA tempcpu MACRO CPUDesc $proc, $id, $mask, $arch, $type, $s, $dsz, $das, $dln, $isz, $ias, $iln LCLA type type SETA (CT_ctype_$type:SHL:CT_ctype_pos)+($s:SHL:CT_S_pos) tempcpu CSzDesc $dsz, $das, $dln type SETA type+(tempcpu:SHL:CT_Dsize_pos) [ :LNOT:($s=0 :LAND: "$isz"="") tempcpu CSzDesc $isz, $ias, $iln ] type SETA type+(tempcpu:SHL:CT_Isize_pos) ASSERT ($id :AND: :NOT: $mask) = 0 DCD $id, $mask, type DCB $proc, $arch, 0, 0 MEND MACRO $var CSzDesc $sz, $as, $ln $var SETA (CT_size_$sz:SHL:CT_size_pos)+(CT_assoc_$as:SHL:CT_assoc_pos)+(CT_len_$ln:SHL:CT_len_pos) $var SETA $var+(CT_M_$sz:SHL:CT_M_pos) MEND ; CPUDesc table for ARMv3-ARMv6 KnownCPUTable ; /------Cache Type register fields-----\. ; ID reg Mask Arch Type S Dsz Das Dln Isz Ias Iln CPUDesc ARM600, &000600, &00FFF0, ARMv3, WT, 0, 4K, 64, 4 CPUDesc ARM610, &000610, &00FFF0, ARMv3, WT, 0, 4K, 64, 4 CPUDesc ARMunk, &000000, &00F000, ARMv3, WT, 0, 4K, 64, 4 CPUDesc ARM700, &007000, &FFFFF0, ARMv3, WT, 0, 8K, 4, 8 CPUDesc ARM710, &007100, &FFFFF0, ARMv3, WT, 0, 8K, 4, 8 CPUDesc ARM710a, &047100, &FDFFF0, ARMv3, WT, 0, 8K, 4, 4 CPUDesc ARM7500, &027100, &FFFFF0, ARMv3, WT, 0, 4K, 4, 4 CPUDesc ARM7500FE, &077100, &FFFFF0, ARMv3, WT, 0, 4K, 4, 4 CPUDesc ARMunk, &007000, &80F000, ARMv3, WT, 0, 8K, 4, 4 CPUDesc ARM720T, &807200, &FFFFF0, ARMv4T, WT, 0, 8K, 4, 4 CPUDesc ARMunk, &807000, &80F000, ARMv4T, WT, 0, 8K, 4, 4 CPUDesc SA110_preRevT, &01A100, &0FFFFC, ARMv4, WB_Crd, 1, 16K, 32, 8, 16K, 32, 8 CPUDesc SA110, &01A100, &0FFFF0, ARMv4, WB_Crd, 1, 16K, 32, 8, 16K, 32, 8 CPUDesc SA1100, &01A110, &0FFFF0, ARMv4, WB_Crd, 1, 8K, 32, 8, 16K, 32, 8 CPUDesc SA1110, &01B110, &0FFFF0, ARMv4, WB_Crd, 1, 8K, 32, 8, 16K, 32, 8 CPUDesc ARM920T, &029200, &0FFFF0, ARMv4T, WB_CR7_LDa, 1, 16K, 64, 8, 16K, 64, 8 CPUDesc ARM922T, &029220, &0FFFF0, ARMv4T, WB_CR7_LDa, 1, 8K, 64, 8, 8K, 64, 8 CPUDesc X80200, &052000, &0FFFF0, ARMv5TE, WB_Cal_LD, 1, 32K, 32, 8, 32K, 32, 8 CPUDesc X80321, &69052400, &FFFFF700, ARMv5TE, WB_Cal_LD, 1, 32K, 32, 8, 32K, 32, 8 DCD -1 ; Simplified CPUDesc table for ARMvF ; The cache size data is ignored for ARMv7. 
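;
; As with KnownCPUTable, an entry matches when (ID AND mask) = id; eg. the
; Cortex_A8 row below matches any main ID register whose bits 4-15 read &C08.
; The cache size fields here are only placeholders (the real geometry is read
; from CP15 by ARM_Analyse_Fancy), but the cache type field still selects
; which family of ARMops gets installed.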
KnownCPUTable_Fancy
        CPUDesc ARM1176JZF_S, &00B760, &00FFF0, ARMvF, WB_CR7_LDc, 1, 16K,  4,  8, 16K,  4,  8
        CPUDesc Cortex_A5,    &00C050, &00FFF0, ARMvF, WB_CR7_Lx,  1, 16K, 32, 16, 16K, 32, 16
        CPUDesc Cortex_A7,    &00C070, &00FFF0, ARMvF, WB_CR7_Lx,  1, 16K, 32, 16, 16K, 32, 16
        CPUDesc Cortex_A8,    &00C080, &00FFF0, ARMvF, WB_CR7_Lx,  1, 32K, 32, 16, 32K, 32, 16
        CPUDesc Cortex_A9,    &00C090, &00FFF0, ARMvF, WB_CR7_Lx,  1, 32K, 32, 16, 32K, 32, 16
        CPUDesc Cortex_A12,   &00C0D0, &00FFF0, ARMvF, WB_CR7_Lx,  1, 32K, 32, 16, 32K, 32, 16
        CPUDesc Cortex_A15,   &00C0F0, &00FFF0, ARMvF, WB_CR7_Lx,  1, 32K, 32, 16, 32K, 32, 16
        CPUDesc Cortex_A17,   &00C0E0, &00FFF0, ARMvF, WB_CR7_Lx,  1, 32K, 32, 16, 32K, 32, 16
        DCD     -1

; Peculiar characteristics of individual ARMs not deducible otherwise. First field is
; flags to set, second flags to clear.
KnownCPUFlags
        DCD     0, 0                                              ; ARM 600
        DCD     0, 0                                              ; ARM 610
        DCD     0, 0                                              ; ARM 700
        DCD     0, 0                                              ; ARM 710
        DCD     0, 0                                              ; ARM 710a
        DCD     CPUFlag_AbortRestartBroken+CPUFlag_InterruptDelay, 0 ; SA 110 pre revT
        DCD     CPUFlag_InterruptDelay, 0                         ; SA 110 revT or later
        DCD     0, 0                                              ; ARM 7500
        DCD     0, 0                                              ; ARM 7500FE
        DCD     CPUFlag_InterruptDelay, 0                         ; SA 1100
        DCD     CPUFlag_InterruptDelay, 0                         ; SA 1110
        DCD     CPUFlag_NoWBDrain, 0                              ; ARM 720T
        DCD     0, 0                                              ; ARM 920T
        DCD     0, 0                                              ; ARM 922T
        DCD     CPUFlag_ExtendedPages+CPUFlag_XScale, 0           ; X80200
        DCD     CPUFlag_XScale, 0                                 ; X80321
        DCD     0, 0                                              ; ARM1176JZF_S
        DCD     0, 0                                              ; Cortex_A5
        DCD     0, 0                                              ; Cortex_A7
        DCD     0, 0                                              ; Cortex_A8
        DCD     0, 0                                              ; Cortex_A9
        DCD     0, 0                                              ; Cortex_A12
        DCD     0, 0                                              ; Cortex_A15
        DCD     0, 0                                              ; Cortex_A17

 [ MEMM_Type = "VMSAv6"

; --------------------------------------------------------------------------
; ----- ARM_Analyse_Fancy --------------------------------------------------
; --------------------------------------------------------------------------
;
; For ARMv7 ARMs (arch=&F), we can detect everything via the feature registers
; TODO - There's some stuff in here that can be tidied up/removed
; Things we need to set up:
;   ProcessorType      (as listed in hdr.ARMops)
;   Cache_Type         (CT_ctype_* from hdr:MEMM.ARM600)
;   ProcessorArch      (as reported by Init_ARMarch)
;   ProcessorFlags     (CPUFlag_* from hdr.ARMops)
;   Proc_*             (Cache/TLB/IMB/MMU function pointers)
;   MMU_PCBTrans       (Points to lookup table for translating page table cache options)
;   ICache_*, DCache_* (ICache, DCache properties - optional, since not used externally?)

ARM_Analyse_Fancy
        Push    "v1,v2,v5,v6,v7,lr"
        ARM_read_ID v1
        LDR     v6, =ZeroPage
        ADRL    v7, KnownCPUTable_Fancy
10
        LDMIA   v7!, {a1, a2}
        CMP     a1, #-1
        BEQ     %FT20
        AND     a2, v1, a2
        TEQ     a1, a2
        ADDNE   v7, v7, #8
        BNE     %BT10
20
        LDR     v2, [v7]
        CMP     a1, #-1
        LDRNEB  a2, [v7, #4]
        MOVEQ   a2, #ARMunk
        STRB    a2, [v6, #ProcessorType]

        AND     a1, v2, #CT_ctype_mask
        MOV     a1, a1, LSR #CT_ctype_pos
        STRB    a1, [v6, #Cache_Type]

        MOV     v5, #CPUFlag_32bitOS+CPUFlag_No26bitMode ; 26bit has been obsolete for a long time
 [ HiProcVecs
        ORR     v5, v5, #CPUFlag_HiProcVecs
 ]

        ; Work out whether the cache info is in ARMv6 or ARMv7 style
        MRC     p15, 0, a1, c0, c0, 1
        TST     a1, #&80000000
        BNE     %FT25

        ; ARMv6 format cache type register.
        ; CPUs like the ARM1176JZF-S are available with a range of cache sizes,
        ; so it's not safe to rely on the values in the CPU table. Fortunately
        ; all ARMv6 CPUs implement the register (by contrast, for the "plain"
        ; ARM case, no ARMv3 CPUs, some ARMv4 CPUs and all ARMv5 CPUs implement
        ; it, so it needs to drop back to the table in some cases).
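        ; The ARMv6 encoding packs the I and D geometry into the same fields
        ; as the pre-v6 cache type register, so each half can simply be passed
        ; through EvaluateCache below; the S bit still indicates split
        ; (Harvard) caches.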
ARM_read_cachetype v2 MOV a1, v2, LSR #CT_Isize_pos ADD a2, v6, #ICache_Info BL EvaluateCache MOV a1, v2, LSR #CT_Dsize_pos ADD a2, v6, #DCache_Info BL EvaluateCache TST v2, #CT_S ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas+CPUFlag_SplitCache B %FT27 25 ; ARMv7 format cache type register. ; This should(!) mean that we have the cache level ID register, ; and all the other ARMv7 cache registers. ; Do we have a split cache? MRC p15, 1, a1, c0, c0, 1 AND a2, a1, #7 TEQ a2, #3 ORREQ v5, v5, #CPUFlag_SynchroniseCodeAreas+CPUFlag_SplitCache 27 [ CacheOff ORR v5, v5, #CPUFlag_SynchroniseCodeAreas | ARM_read_control a1 ; if Z bit set then we have branch prediction, TST a1, #MMUC_Z ; so we need OS_SynchroniseCodeAreas even if not ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas ; split caches ] ; Test abort timing (base restored or base updated) MOV a1, #&8000 LDR a2, [a1], #4 ; Will abort - DAb handler will continue execution TEQ a1, #&8000 ORREQ v5, v5, #CPUFlag_BaseRestored ; Check store of PC 30 STR pc, [sp, #-4]! ADR a2, %BT30 + 8 LDR a1, [sp], #4 TEQ a1, a2 ORREQ v5, v5, #CPUFlag_StorePCplus8 BL Init_ARMarch STRB a1, [v6, #ProcessorArch] MRC p15, 0, a1, c0, c2, 2 TST a1, #&F000 ORRNE v5, v5, #CPUFlag_LongMul MRC p15, 0, a1, c0, c1, 0 TST a1, #&F000 ORRNE v5, v5, #CPUFlag_Thumb MSR CPSR_f, #Q32_bit MRS lr, CPSR TST lr, #Q32_bit ORRNE v5, v5, #CPUFlag_DSP ; Should we check instruction set attr register 3 for this? ; Other flags not checked for above: ; CPUFlag_InterruptDelay ; CPUFlag_VectorReadException ; CPUFlag_ExtendedPages ; CPUFlag_NoWBDrain ; CPUFlag_AbortRestartBroken ; CPUFlag_XScale ; CPUFlag_XScaleJTAGconnected LDRB v4, [v6, #ProcessorType] TEQ v4, #ARMunk ; Modify deduced flags ADRNEL lr, KnownCPUFlags ADDNE lr, lr, v4, LSL #3 LDMNEIA lr, {a2, a3} ORRNE v5, v5, a2 BICNE v5, v5, a3 STR v5, [v6, #ProcessorFlags] ; Cache analysis LDRB a2, [v6, #Cache_Type] TEQ a2, #CT_ctype_WT TSTEQ v5, #CPUFlag_SplitCache BEQ Analyse_WriteThroughUnified ; eg. ARM7TDMI derivative TEQ a2, #CT_ctype_WB_CR7_LDa ; eg. ARM9 TEQNE a2, #CT_ctype_WB_CR7_LDc ; eg. ARM1176JZF-S - differs only in cache lockdown BEQ Analyse_WB_CR7_LDa TEQ a2, #CT_ctype_WB_Crd BEQ Analyse_WB_Crd ; eg. StrongARM TEQ a2, #CT_ctype_WB_Cal_LD ; warning, allocation clash with CT_ctype_WB_CR7_LDd BEQ Analyse_WB_Cal_LD ; assume XScale TEQ a2, #CT_ctype_WB_CR7_Lx BEQ Analyse_WB_CR7_Lx ; eg. Cortex-A8, Cortex-A9 ; others ... 
B WeirdARMPanic ; stiff :) ] ; MEMM_Type = "VMSAv6" ; -------------------------------------------------------------------------- ; ----- ARMops ------------------------------------------------------------- ; -------------------------------------------------------------------------- ; ; ARMops are the routines required by the kernel for cache/MMU control ; the kernel vectors to the appropriate ops for the given ARM at boot ; ; The Rules: ; - These routines may corrupt a1 and lr only ; - (lr can of course only be corrupted whilst still returning to correct ; link address) ; - stack is available, at least 16 words can be stacked ; - a NULL op would be a simple MOV pc, lr ; ; -------------------------------------------------------------------------- ; ----- ARMops for ARMv3 --------------------------------------------------- ; -------------------------------------------------------------------------- ; ; ARMv3 ARMs include ARM710, ARM610, ARM7500 ; Cache_Invalidate_ARMv3 MCR p15, 0, a1, c7, c0 NullOp MOV pc, lr WriteBuffer_Drain_ARMv3 ;swap always forces unbuffered write, stalling till WB empty SUB sp, sp, #4 SWP a1, a1, [sp] ADD sp, sp, #4 MOV pc, lr TLB_Invalidate_ARMv3 MCR p15, 0, a1, c5, c0 MOV pc, lr ; a1 = page entry to invalidate (page aligned address) ; TLB_InvalidateEntry_ARMv3 MCR p15, 0, a1, c6, c0 MOV pc, lr MMU_Changing_ARMv3 MCR p15, 0, a1, c7, c0 ; invalidate cache MCR p15, 0, a1, c5, c0 ; invalidate TLB MOV pc, lr MMU_ChangingUncached_ARMv3 MCR p15, 0, a1, c5, c0 ; invalidate TLB MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_ARMv3 MCR p15, 0, a1, c7, c0 ; invalidate cache MCR p15, 0, a1, c6, c0 ; invalidate TLB entry MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_ARMv3 ROUT CMP a2, #16 ; arbitrary-ish threshold BHS MMU_Changing_ARMv3 Push "a2" MCR p15, 0, a1, c7, c0 ; invalidate cache 10 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2" MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingUncachedEntry_ARMv3 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries_ARMv3 ROUT CMP a2, #16 ; arbitrary-ish threshold BHS MMU_ChangingUncached_ARMv3 Push "a2" 10 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2" MOV pc, lr Cache_RangeThreshold_ARMv3 ! 0, "arbitrary Cache_RangeThreshold_ARMv3" MOV a1, #16*PageSize MOV pc, lr LTORG ; -------------------------------------------------------------------------- ; ----- generic ARMops for simple ARMs, ARMv4 onwards ---------------------- ; -------------------------------------------------------------------------- ; ; eg. 
; ARM7TDMI based ARMs, unified, writethrough cache
;
Cache_InvalidateUnified
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7
        MOV     pc, lr

WriteBuffer_Drain_OffOn ; used if ARM has no drain WBuffer MCR op
        Push    "a2"
        ARM_read_control a1
        BIC     a2, a1, #MMUC_W
        ARM_write_control a2
        ARM_write_control a1
        Pull    "a2"
        MOV     pc, lr

WriteBuffer_Drain ; used if ARM has proper drain WBuffer MCR op
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4
        MOV     pc, lr

TLB_Invalidate_Unified
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7
        MOV     pc, lr

; a1 = page entry to invalidate (page aligned address)
;
TLB_InvalidateEntry_Unified
        MCR     p15, 0, a1, c8, c7, 1
        MOV     pc, lr

MMU_Changing_Writethrough
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7              ; invalidate cache
        MCR     p15, 0, a1, c8, c7              ; invalidate TLB
        MOV     pc, lr

MMU_ChangingUncached
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7              ; invalidate TLB
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_Writethrough
        Push    "a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7              ; invalidate cache
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate TLB entry
        Pull    "a4"
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_Writethrough ROUT
        CMP     a2, #16                         ; arbitrary-ish threshold
        BHS     MMU_Changing_Writethrough
        Push    "a2,a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7              ; invalidate cache
10
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate TLB entry
        SUBS    a2, a2, #1                      ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2,a4"
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingUncachedEntry
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate TLB entry
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries ROUT
        CMP     a2, #16                         ; arbitrary-ish threshold
        BHS     MMU_ChangingUncached
        Push    "a2"
10
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate TLB entry
        SUBS    a2, a2, #1                      ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr

Cache_RangeThreshold_Writethrough
        ! 0, "arbitrary Cache_RangeThreshold_Writethrough"
        MOV     a1, #16*PageSize
        MOV     pc, lr

; --------------------------------------------------------------------------
; ----- ARMops for ARM9 and the like ---------------------------------------
; --------------------------------------------------------------------------
; WB_CR7_LDa refers to ARMs with writeback data cache, cleaned with
; register 7, lockdown available (format A)
;
; Note that ARM920 etc have writeback/writethrough data cache selectable
; by MMU regions. For simplicity, we assume cacheable pages are mostly
; writeback. Any writethrough pages will have redundant clean operations
; applied when moved, for example, but this is a small overhead (cleaning
; a clean line is very quick on ARM 9).
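;
; For reference, the clean-by-index parameters used by the routines below are
; derived in Analyse_WB_CR7_LDa from the decoded geometry:
;   DCache_IndexBit      = 1 << (32 - log2(ASSOCIATIVITY))   (log2 rounded up)
;   DCache_IndexSegStart = (NSETS - 1) * LINELEN
; eg. for an ARM920 (64 way, 8 sets, 32 byte lines) this gives &04000000 and
; &E0 respectively, matching the worked example in Cache_CleanAll_WB_CR7_LDa.
;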
Cache_CleanAll_WB_CR7_LDa ROUT ; ; only guarantees to clean lines not involved in interrupts (so we can ; clean without disabling interrupts) ; ; Clean cache by traversing all segment and index values ; As a concrete example, for ARM 920 (16k+16k caches) we would have: ; ; DCache_LineLen = 32 (32 byte cache line, segment field starts at bit 5) ; DCache_IndexBit = &04000000 (index field starts at bit 26) ; DCache_IndexSegStart = &000000E0 (start at index=0, segment = 7) ; Push "a2, ip" LDR ip, =ZeroPage LDRB a1, [ip, #DCache_LineLen] ; segment field starts at this bit LDR a2, [ip, #DCache_IndexBit] ; index field starts at this bit LDR ip, [ip, #DCache_IndexSegStart] ; starting value, with index at min, seg at max 10 MCR p15, 0, ip, c7, c10, 2 ; clean DCache entry by segment/index ADDS ip, ip, a2 ; next index, counting up, CS if wrapped back to 0 BCC %BT10 SUBS ip, ip, a1 ; next segment, counting down, CC if wrapped back to max BCS %BT10 ; if segment wrapped, then we've finished MOV ip, #0 MCR p15, 0, ip, c7, c10, 4 ; drain WBuffer Pull "a2, ip" MOV pc, lr Cache_CleanInvalidateAll_WB_CR7_LDa ROUT ; ; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache ; Push "a2, ip" LDR ip, =ZeroPage LDRB a1, [ip, #DCache_LineLen] ; segment field starts at this bit LDR a2, [ip, #DCache_IndexBit] ; index field starts at this bit LDR ip, [ip, #DCache_IndexSegStart] ; starting value, with index at min, seg at max 10 MCR p15, 0, ip, c7, c14, 2 ; clean&invalidate DCache entry by segment/index ADDS ip, ip, a2 ; next index, counting up, CS if wrapped back to 0 BCC %BT10 SUBS ip, ip, a1 ; next segment, counting down, CC if wrapped back to max BCS %BT10 ; if segment wrapped, then we've finished MOV ip, #0 MCR p15, 0, ip, c7, c10, 4 ; drain WBuffer MCR p15, 0, ip, c7, c5, 0 ; invalidate ICache Pull "a2, ip" MOV pc, lr Cache_InvalidateAll_WB_CR7_LDa ROUT ; ; no clean, assume caller knows what's happening ; MOV a1, #0 MCR p15, 0, a1, c7, c7, 0 ; invalidate ICache and DCache MOV pc, lr Cache_RangeThreshold_WB_CR7_LDa ROUT LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_CR7_LDa ROUT MMU_ChangingUncached_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MOV pc, lr ; a1 = page affected (page aligned address) ; TLB_InvalidateEntry_WB_CR7_LDa ROUT MMU_ChangingUncachedEntry_WB_CR7_LDa MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MOV pc, lr WriteBuffer_Drain_WB_CR7_LDa ROUT MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MOV pc, lr IMB_Full_WB_CR7_LDa ROUT ; ; do: clean DCache; drain WBuffer, invalidate ICache ; Push "lr" BL Cache_CleanAll_WB_CR7_LDa ; also drains Wbuffer MOV a1, #0 MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache Pull "pc" ; a1 = start address (inclusive, cache line aligned) ; a2 = end address (exclusive, cache line aligned) ; IMB_Range_WB_CR7_LDa ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_CR7_LDa Push "lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry by VA MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MCR p15, 0, a1, c7, c5, 6 ; flush branch predictors Pull "pc" MMU_Changing_WB_CR7_LDa ROUT Push "lr" BL Cache_CleanInvalidateAll_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB Pull "pc" ; a1 = page affected (page aligned 
address) ; MMU_ChangingEntry_WB_CR7_LDa ROUT Push "a2, lr" ADD a2, a1, #PageSize LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MOV lr, #0 MCR p15, 0, lr, c7, c10, 4 ; drain WBuffer MCR p15, 0, a1, c7, c5, 6 ; flush branch predictors SUB a1, a1, #PageSize MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry Pull "a2, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_WB_CR7_LDa ROUT Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MCR p15, 0, a1, c7, c5, 6 ; flush branch predictors MOV a1, lr ; restore start address 20 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 Pull "a2, a3, pc" ; 30 BL Cache_CleanInvalidateAll_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB Pull "a2, a3, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries_WB_CR7_LDa ROUT CMP a2, #32 ; arbitrary-ish threshold BHS %FT20 Push "a2" 10 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize SUBS a2, a2, #1 BNE %BT10 Pull "a2" MOV pc, lr ; 20 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MOV pc, lr ; -------------------------------------------------------------------------- ; ----- ARMops for StrongARM and the like ---------------------------------- ; -------------------------------------------------------------------------- ; WB_Crd is Writeback data cache, clean by reading data from cleaner area ; Currently no support for mini data cache on some StrongARM variants. Mini ; cache is always writeback and must have cleaning support, so is very ; awkward to use for cacheable screen, say. ; Global cache cleaning requires address space for private cleaner areas (not accessed ; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency ; hit), which means that the cleaner data is not invalidated afterwards. This is fine for ; RISC OS - where the private area is not used for anything else, and any re-use of the ; cache under interrupts is safe (eg. a page being moved is *never* involved in any ; active interrupts). ; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum ; cleaning cost while guaranteeing proper clean even if previous clean data is present. If ; the clean routine is re-entered, an independent, double sized clean is initiated. This ; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating ; the routine at all. The overhead is small, since by far the most common cleaning will be ; non-re-entered. The upshot is that the cleaner address space available must be at least 4 ; times the cache size: ; 1 : used alternately, on 1st, 3rd, ... non-re-entered cleans ; 2 : used alternately, on 2nd, 4th, ... 
non-re-entered cleans ; 3 : used only for first half of a re-entered clean ; 4 : used only for second half of a re-entered clean ; ; DCache_CleanBaseAddress : start address of total cleaner space ; DCache_CleanNextAddress : start address for next non-re-entered clean, or 0 if re-entered Cache_CleanAll_WB_Crd ROUT ; ; - cleans data cache (and invalidates it as a side effect) ; - can be used with interrupts enabled (to avoid latency over time of clean) ; - can be re-entered ; - see remarks at top of StrongARM ops for discussion of strategy ; Push "a2-a4, v1, v2, lr" LDR lr, =ZeroPage LDR a1, [lr, #DCache_CleanBaseAddress] LDR a2, =DCache_CleanNextAddress LDR a3, [lr, #DCache_Size] LDRB a4, [lr, #DCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 LDR lr, [v1], a4 TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Cache_Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer Pull "a2-a4, v1, v2, pc" Cache_CleanInvalidateAll_WB_Crd ROUT IMB_Full_WB_Crd ; ;does not truly invalidate DCache, but effectively invalidates (flushes) all lines not ;involved in interrupts - this is sufficient for OS requirements, and means we don't ;have to disable interrupts for possibly slow clean ; Push "lr" BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache Pull "pc" Cache_InvalidateAll_WB_Crd ; ; no clean, assume caller knows what is happening ; MCR p15, 0, a1, c7, c7, 0 ;flush ICache and DCache MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MOV pc, lr Cache_RangeThreshold_WB_Crd LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_Crd MMU_ChangingUncached_WB_Crd MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB MOV pc, lr TLB_InvalidateEntry_WB_Crd MMU_ChangingUncachedEntry_WB_Crd MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry MCR p15, 0, a1, c8, c5, 0 ;flush ITLB MOV pc, lr WriteBuffer_Drain_WB_Crd MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MOV pc, lr IMB_Range_WB_Crd ROUT SUB a2, a2, a1 CMP a2, #64*1024 ;arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_Crd Push "lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache Pull "pc" MMU_Changing_WB_Crd Push "lr" BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB Pull "pc" MMU_ChangingEntry_WB_Crd ROUT ; ;there is no clean&invalidate DCache instruction, however we can do clean ;entry followed by invalidate entry without an interrupt hole, because they ;are for the same virtual address (and that virtual address will not be ;involved in interrupts, since it is involved in remapping) ; Push "a2, lr" ADD a2, a1, #PageSize LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry MCR p15, 0, a1, c7, c6, 1 ;flush DCache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 SUB a1, a1, #PageSize MCR p15, 0, a1, c7, c10, 4 ;drain 
WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "a2, pc" MMU_ChangingEntries_WB_Crd ROUT ; ;same comments as MMU_ChangingEntry_WB_Crd ; Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry MCR p15, 0, a1, c7, c6, 1 ;flush DCache entry ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache MOV a1, lr ;restore start address 20 MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "a2, a3, pc" ; 30 BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB Pull "a2, a3, pc" MMU_ChangingUncachedEntries_WB_Crd ROUT CMP a2, #32 ;arbitrary-ish threshold BHS %FT20 Push "lr" MOV lr, a2 10 MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry ADD a1, a1, #PageSize SUBS lr, lr, #1 BNE %BT10 MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "pc" ; 20 MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB MOV pc, lr ; ARMops for XScale, mjs Feb 2001 ; ; WB_Cal_LD is writeback, clean with allocate, lockdown ; ; If the mini data cache is used (XScaleMiniCache true), it is assumed to be ; configured writethrough (eg. used for RISC OS screen memory). This saves an ugly/slow ; mini cache clean for things like IMB_Full. ; ; Sadly, for global cache invalidate with mini cache, things are awkward. We can't clean the ; main cache then do the global invalidate MCR, unless we tolerate having _all_ interrupts ; off (else the main cache may be slightly dirty from interrupts, and the invalidate ; will lose data). So we must reluctantly 'invalidate' the mini cache by the ugly/slow ; mechanism as if we were cleaning it :-( Intel should provide a separate global invalidate ; (and perhaps a line allocate) for the mini cache. ; ; We do not use lockdown. ; ; For simplicity, we assume cacheable pages are mostly writeback. Any writethrough ; pages will be invalidated as if they were writeback, but there is little overhead ; (cleaning a clean line or allocating a line from cleaner area are both fast). ; Global cache cleaning requires address space for private cleaner areas (not accessed ; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency ; hit), which means that the cleaner data is not invalidated afterwards. This is fine for ; RISC OS - where the private area is not used for anything else, and any re-use of the ; cache under interrupts is safe (eg. a page being moved is *never* involved in any ; active interrupts). ; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum ; cleaning cost while guaranteeing proper clean even if previous clean data is present. If ; the clean routine is re-entered, an independent, double sized clean is initiated. This ; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating ; the routine at all. The overhead is small, since by far the most common cleaning will be ; non-re-entered. The upshot is that the cleaner address space available must be at least 4 ; times the cache size: ; 1 : used alternately, on 1st, 3rd, ... 
non-re-entered cleans ; 2 : used alternately, on 2nd, 4th, ... non-re-entered cleans ; 3 : used only for first half of a re-entered clean ; 4 : used only for second half of a re-entered clean ; ; If the mini cache is used, it has its own equivalent cleaner space and algorithm. ; Parameters for each cache are: ; ; Cache_CleanBaseAddress : start address of total cleaner space ; Cache_CleanNextAddress : start address for next non-re-entered clean, or 0 if re-entered GBLL XScaleMiniCache ; *must* be configured writethrough if used XScaleMiniCache SETL {FALSE} ; MACRO to do Intel approved CPWAIT, to guarantee any previous MCR's have taken effect ; corrupts a1 ; MACRO CPWAIT MRC p15, 0, a1, c2, c0, 0 ; arbitrary read of CP15 MOV a1, a1 ; wait for it ; SUB pc, pc, #4 omitted, because all ops have a pc load to return to caller MEND Cache_CleanAll_WB_Cal_LD ROUT ; ; - cleans main cache (and invalidates as a side effect) ; - if mini cache is in use, will be writethrough so no clean required ; - can be used with interrupts enabled (to avoid latency over time of clean) ; - can be re-entered ; - see remarks at top of XScale ops for discussion of strategy ; Push "a2-a4, v1, v2, lr" LDR lr, =ZeroPage LDR a1, [lr, #DCache_CleanBaseAddress] LDR a2, =ZeroPage+DCache_CleanNextAddress LDR a3, [lr, #DCache_Size] LDRB a4, [lr, #DCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 MCR p15, 0, v1, c7, c2, 5 ; allocate address from cleaner space ADD v1, v1, a4 TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Cache_Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) Pull "a2-a4, v1, v2, pc" [ XScaleMiniCache Cache_MiniInvalidateAll_WB_Cal_LD ROUT ; ; similar to Cache_CleanAll_WB_Cal_LD, but must do direct reads (cannot use allocate address MCR), and ; 'cleans' to achieve invalidate as side effect (mini cache will be configured writethrough) ; Push "a2-a4, v1, v2, lr" LDR lr, =ZeroPage LDR a1, [lr, #MCache_CleanBaseAddress] LDR a2, =ZeroPage+MCache_CleanNextAddr LDR a3, [lr, #MCache_Size] LDRB a4, [lr, #MCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 LDR lr, [v1], a4 ; read a line of cleaner data TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next ; note, no drain WBuffer, since we are really only invalidating a writethrough cache Pull "a2-a4, v1, v2, pc" ] ; XScaleMiniCache Cache_CleanInvalidateAll_WB_Cal_LD ROUT ; ; - cleans main cache (and invalidates wrt OS stuff as a side effect) ; - if mini cache in use (will be writethrough), 'cleans' in order to invalidate as side effect ; Push "lr" BL Cache_CleanAll_WB_Cal_LD [ XScaleMiniCache BL 
Cache_MiniInvalidateAll_WB_Cal_LD ] MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB CPWAIT Pull "pc" Cache_InvalidateAll_WB_Cal_LD ROUT ; ; no clean, assume caller knows what's happening ; MCR p15, 0, a1, c7, c7, 0 ; invalidate DCache, (MiniCache), ICache and BTB CPWAIT MOV pc, lr Cache_RangeThreshold_WB_Cal_LD ROUT LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_Cal_LD ROUT MMU_ChangingUncached_WB_Cal_LD MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT MOV pc, lr TLB_InvalidateEntry_WB_Cal_LD ROUT MMU_ChangingUncachedEntry_WB_Cal_LD MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry CPWAIT MOV pc, lr WriteBuffer_Drain_WB_Cal_LD ROUT MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) MOV pc, lr IMB_Full_WB_Cal_LD Push "lr" BL Cache_CleanAll_WB_Cal_LD ; clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB CPWAIT Pull "pc" IMB_Range_WB_Cal_LD ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_Cal_LD Push "lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, lr CMP a1, a2 BLO %BT10 [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) Pull "pc" MMU_Changing_WB_Cal_LD ROUT Push "lr" BL Cache_CleanAll_WB_Cal_LD MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT Pull "pc" MMU_ChangingEntry_WB_Cal_LD ROUT ; ;there is no clean&invalidate DCache instruction, however we can do clean ;entry followed by invalidate entry without an interrupt hole, because they ;are for the same virtual address (and that virtual address will not be ;involved in interrupts, since it is involved in remapping) ; Push "a2, lr" ADD a2, a1, #PageSize LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry MCR p15, 0, a1, c7, c6, 1 ; invalidate DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, lr CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] SUB a1, a1, #PageSize MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry CPWAIT Pull "a2, pc" MMU_ChangingEntries_WB_Cal_LD ROUT ; ;same comments as MMU_ChangingEntry_WB_Cal_LD ; Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry MCR p15, 0, a1, c7, c6, 1 ; invalidate DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] MOV a1, lr ; restore start address 20 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize 
CMP a1, a2 BLO %BT20 CPWAIT Pull "a2, a3, pc" ; 30 BL Cache_CleanInvalidateAll_WB_Cal_LD MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT Pull "a2, a3, pc" MMU_ChangingUncachedEntries_WB_Cal_LD ROUT CMP a2, #32 ; arbitrary-ish threshold BHS %FT20 Push "lr" MOV lr, a2 10 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry SUBS lr, lr, #1 ADD a1, a1, #PageSize BNE %BT10 CPWAIT Pull "pc" ; 20 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT MOV pc, lr [ MEMM_Type = "VMSAv6" ; Need appropriate myIMB, etc. implementations if this is to be removed ; -------------------------------------------------------------------------- ; ----- ARMops for Cortex-A8 and the like ---------------------------------- ; -------------------------------------------------------------------------- ; WB_CR7_Lx refers to ARMs with writeback data cache, cleaned with ; register 7, and (potentially) multiple cache levels ; ; DCache_LineLen = log2(line len)-2 for smallest data/unified cache line length ; ICache_LineLen = log2(line len)-2 for smallest instruction cache line length ; DCache_RangeThreshold = clean threshold for data cache ; Cache_Lx_Info = Cache level ID register ; Cache_Lx_DTable = Cache size identification register for all 7 data/unified caches ; Cache_Lx_ITable = Cache size identification register for all 7 instruction caches ; ARMv7 cache maintenance routines are a bit long-winded, so we use this macro ; to reduce the risk of mistakes creeping in due to code duplication ; ; $op: Operation to perform ('clean', 'invalidate', 'cleaninvalidate') ; $levels: Which levels to apply to ('lou', 'loc', 'louis') ; Uses r0-r8 & lr as temp ; Performs the indicated op on the indicated data & unified caches ; ; Code based around the alternate/faster code given in the ARMv7 ARM (section ; B2.2.4, alternate/faster code only in doc revision 9), but tightened up a bit ; ; Note that HAL_InvalidateCache_ARMvF uses its own implementation of this ; algorithm, since it must cope with different temporary registers and it needs ; to read the cache info straight from the CP15 registers ; MACRO MaintainDataCache_WB_CR7_Lx $op, $levels LDR lr, =ZeroPage LDR r0, [lr, #Cache_Lx_Info]! ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info [ "$levels"="lou" ANDS r3, r0, #&38000000 MOV r3, r3, LSR #26 ; Cache level value (naturally aligned) | [ "$levels"="loc" ANDS r3, r0, #&07000000 MOV r3, r3, LSR #23 ; Cache level value (naturally aligned) | [ "$levels"="louis" ANDS r3, r0, #&00E00000 MOV r3, r3, LSR #20 ; Cache level value (naturally aligned) | ! 
1, "Unrecognised levels" ] ] ] BEQ %FT50 MOV r8, #0 ; Current cache level 10 ; Loop1 ADD r2, r8, r8, LSR #1 ; Work out 3 x cachelevel MOV r1, r0, LSR r2 ; bottom 3 bits are the Cache type for this level AND r1, r1, #7 ; get those 3 bits alone CMP r1, #2 BLT %FT40 ; no cache or only instruction cache at this level LDR r1, [lr, r8, LSL #1] ; read CCSIDR to r1 AND r2, r1, #&7 ; extract the line length field ADD r2, r2, #4 ; add 4 for the line length offset (log2 16 bytes) LDR r7, =&3FF AND r7, r7, r1, LSR #3 ; r7 is the max number on the way size (right aligned) CLZ r5, r7 ; r5 is the bit position of the way size increment LDR r4, =&7FFF AND r4, r4, r1, LSR #13 ; r4 is the max number of the index size (right aligned) 20 ; Loop2 MOV r1, r4 ; r1 working copy of the max index size (right aligned) 30 ; Loop3 ORR r6, r8, r7, LSL r5 ; factor in the way number and cache number into r6 ORR r6, r6, r1, LSL r2 ; factor in the index number [ "$op"="clean" MCR p15, 0, r6, c7, c10, 2 ; Clean | [ "$op"="invalidate" MCR p15, 0, r6, c7, c6, 2 ; Invalidate | [ "$op"="cleaninvalidate" MCR p15, 0, r6, c7, c14, 2 ; Clean & invalidate | ! 1, "Unrecognised op" ] ] ] SUBS r1, r1, #1 ; decrement the index BGE %BT30 SUBS r7, r7, #1 ; decrement the way number BGE %BT20 40 ; Skip ADD r8, r8, #2 CMP r3, r8 BGT %BT10 myDSB ,r0 50 ; Finished MEND Cache_CleanAll_WB_CR7_Lx ROUT ; Clean cache by traversing all sets and ways for all data caches Push "r1-r8,lr" MaintainDataCache_WB_CR7_Lx clean, loc Pull "r1-r8,pc" Cache_CleanInvalidateAll_WB_CR7_Lx ROUT ; ; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache ; Push "r1-r8,lr" MaintainDataCache_WB_CR7_Lx cleaninvalidate, loc MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible Pull "r1-r8,pc" Cache_InvalidateAll_WB_CR7_Lx ROUT ; ; no clean, assume caller knows what's happening ; Push "r1-r8,lr" MaintainDataCache_WB_CR7_Lx invalidate, loc MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible Pull "r1-r8,pc" Cache_RangeThreshold_WB_CR7_Lx ROUT LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr MMU_ChangingUncached_WB_CR7_Lx myDSB ,a1 ; Ensure the page table write has actually completed myISB ,a1,,y ; Also required TLB_InvalidateAll_WB_CR7_Lx ROUT MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingUncachedEntry_WB_CR7_Lx [ NoARMv7 Push "a2" myDSB ,a2 ; Ensure the page table write has actually completed myISB ,a2,,y ; Also required Pull "a2" | myDSB myISB ] TLB_InvalidateEntry_WB_CR7_Lx ROUT MCR p15, 0, a1, c8, c7, 1 ; invalidate ITLB & DTLB entry MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1 ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible MOV pc, lr WriteBuffer_Drain_WB_CR7_Lx ROUT myDSB ,a1 ; DSB is the new name for write 
WriteBuffer_Drain_WB_CR7_Lx ROUT
        myDSB   ,a1                             ; DSB is the new name for write buffer draining
        myISB   ,a1,,y                          ; Also do ISB for extra paranoia
        MOV     pc, lr

IMB_Full_WB_CR7_Lx ROUT
;
; do: clean DCache; drain WBuffer, invalidate ICache/branch predictor
; Luckily, we only need to clean as far as the level of unification
;
        Push    "r1-r8,lr"
        MaintainDataCache_WB_CR7_Lx clean, lou
        MCR     p15, 0, a1, c7, c5, 0           ; invalidate ICache
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1,,y                          ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                          ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "r1-r8,pc"

;  a1 = start address (inclusive, cache line aligned)
;  a2 = end address (exclusive, cache line aligned)
;
IMB_Range_WB_CR7_Lx ROUT
        SUB     a2, a2, a1
        CMP     a2, #32*1024                    ; Maximum L1 cache size on Cortex-A8 is 32K, use that to guess what approach to take
        ADD     a2, a2, a1
        CMPLO   a1, a2                          ; The routine below will fail if the end address wraps around, so just IMB_Full instead
        BHS     IMB_Full_WB_CR7_Lx
        Push    "a1,a3,lr"
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]       ; log2(line len)-2
        MOV     a3, #4
        MOV     lr, a3, LSL lr
10      MCR     p15, 0, a1, c7, c11, 1          ; clean DCache entry by VA to PoU
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        myDSB   ,a1                             ; Wait for clean to complete
        Pull    "a1"                            ; Get start address back
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #ICache_LineLen]       ; Use ICache line length, just in case D&I length differ
        MOV     lr, a3, LSL lr
10      MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1                             ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                          ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a3,pc"
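; As an illustrative sketch only (register usage taken from the entry
; conditions documented above; the buffer symbols are hypothetical, not real
; callers of this routine), code that has just written instructions into a
; buffer would typically do something like:
;
;       MOV     a1, <buffer start>              ; cache line aligned, inclusive
;       ADD     a2, a1, #<buffer length>        ; cache line aligned, exclusive
;       BL      IMB_Range_WB_CR7_Lx
;
; Note the line length reconstruction above: DCache_LineLen holds
; log2(line length)-2, so "MOV a3, #4 : MOV lr, a3, LSL lr" recovers the
; length in bytes (e.g. a 64-byte line is stored as 4, and 4 << 4 = 64).
;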
MMU_Changing_WB_CR7_Lx ROUT
        Push    "lr"
        myDSB   ,a1                             ; Ensure the page table write has actually completed
        myISB   ,a1,,y                          ; Also required
        BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        myDSB   ,a1,,y                          ; Wait for TLB invalidation to complete
        myISB   ,a1,,y                          ; Ensure that the effects are visible
        Pull    "pc"

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_WB_CR7_Lx ROUT
        Push    "a2, lr"
        myDSB   ,lr                             ; Ensure the page table write has actually completed
        myISB   ,lr,,y                          ; Also required
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]       ; log2(line len)-2
        MOV     a2, #4
        MOV     lr, a2, LSL lr
        ADD     a2, a1, #PageSize
10      MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry to PoC
        ADD     a1, a1, lr
        CMP     a1, a2
        BNE     %BT10
        myDSB   ,lr                             ; Wait for clean to complete
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #ICache_LineLen]       ; Use ICache line length, just in case D&I length differ
        MOV     a1, #4
        MOV     lr, a1, LSL lr
        SUB     a1, a2, #PageSize               ; Get start address back
10      MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry to PoC
        ADD     a1, a1, lr
        CMP     a1, a2
        BNE     %BT10
        SUB     a1, a1, #PageSize
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate DTLB and ITLB
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1
        myISB   ,a1,,y
        Pull    "a2, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_WB_CR7_Lx ROUT
        Push    "a2, a3, lr"
        myDSB   ,lr                             ; Ensure the page table write has actually completed
        myISB   ,lr,,y                          ; Also required
        MOV     a2, a2, LSL #Log2PageSize
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold] ; check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                      ; clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen]       ; log2(line len)-2
        MOV     lr, #4
        MOV     a3, lr, LSL a3
        MOV     lr, a1
10      MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry to PoC
        ADD     a1, a1, a3
        CMP     a1, a2
        BNE     %BT10
        myDSB   ,a3                             ; Wait for clean to complete
        LDR     a3, =ZeroPage
        LDRB    a3, [a3, #ICache_LineLen]       ; Use ICache line length, just in case D&I length differ
        MOV     a1, #4
        MOV     a3, a1, LSL a3
        MOV     a1, lr                          ; Get start address back
10      MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry to PoC
        ADD     a1, a1, a3
        CMP     a1, a2
        BNE     %BT10
20      MCR     p15, 0, lr, c8, c7, 1           ; invalidate DTLB & ITLB entry
        ADD     lr, lr, #PageSize
        CMP     lr, a2
        BNE     %BT20
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1
        myISB   ,a1,,y
        Pull    "a2, a3, pc"
;
30      BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        myDSB   ,a1,,y                          ; Wait for TLB invalidation to complete
        myISB   ,a1,,y                          ; Ensure that the effects are visible
        Pull    "a2, a3, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries_WB_CR7_Lx ROUT
        Push    "a2,lr"
        myDSB   ,lr                             ; Ensure the page table write has actually completed
        myISB   ,lr,,y                          ; Also required
        CMP     a2, #32                         ; arbitrary-ish threshold
        MCRHS   p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        BHS     %FT20
10      MCR     p15, 0, a1, c8, c7, 1           ; invalidate DTLB & ITLB entry
        ADD     a1, a1, #PageSize
        SUBS    a2, a2, #1
        BNE     %BT10
20      MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,lr,,y
        myISB   ,lr,,y
        Pull    "a2,pc"

 ] ; MEMM_Type = "VMSAv6"

; --------------------------------------------------------------------------
;
        IMPORT  Write0_Translated

ARM_PrintProcessorType
        LDR     a1, =ZeroPage
        LDRB    a1, [a1, #ProcessorType]
        TEQ     a1, #ARMunk
        MOVEQ   pc, lr

        Push    "lr"
        ADR     a2, PNameTable
        LDHA    a1, a2, a1, a3
        ADD     a1, a2, a1
 [ International
        BL      Write0_Translated
 |
        SWI     XOS_Write0
 ]
        SWI     XOS_NewLine
        SWI     XOS_NewLine
        Pull    "pc"

PNameTable
        DCW     PName_ARM600       - PNameTable
        DCW     PName_ARM610       - PNameTable
        DCW     PName_ARM700       - PNameTable
        DCW     PName_ARM710       - PNameTable
        DCW     PName_ARM710a      - PNameTable
        DCW     PName_SA110        - PNameTable        ; pre rev T
        DCW     PName_SA110        - PNameTable        ; rev T or later
        DCW     PName_ARM7500      - PNameTable
        DCW     PName_ARM7500FE    - PNameTable
        DCW     PName_SA1100       - PNameTable
        DCW     PName_SA1110       - PNameTable
        DCW     PName_ARM720T      - PNameTable
        DCW     PName_ARM920T      - PNameTable
        DCW     PName_ARM922T      - PNameTable
        DCW     PName_X80200       - PNameTable
        DCW     PName_X80321       - PNameTable
        DCW     PName_ARM1176JZF_S - PNameTable
        DCW     PName_Cortex_A5    - PNameTable
        DCW     PName_Cortex_A7    - PNameTable
        DCW     PName_Cortex_A8    - PNameTable
        DCW     PName_Cortex_A9    - PNameTable
        DCW     PName_Cortex_A17   - PNameTable        ; A12 rebranded as A17
        DCW     PName_Cortex_A15   - PNameTable
        DCW     PName_Cortex_A17   - PNameTable

PName_ARM600        = "600:ARM 600 Processor",0
PName_ARM610        = "610:ARM 610 Processor",0
PName_ARM700        = "700:ARM 700 Processor",0
PName_ARM710        = "710:ARM 710 Processor",0
PName_ARM710a       = "710a:ARM 710a Processor",0
PName_SA110         = "SA110:SA-110 Processor",0
PName_ARM7500       = "7500:ARM 7500 Processor",0
PName_ARM7500FE     = "7500FE:ARM 7500FE Processor",0
PName_SA1100        = "SA1100:SA-1100 Processor",0
PName_SA1110        = "SA1110:SA-1110 Processor",0
PName_ARM720T       = "720T:ARM 720T Processor",0
PName_ARM920T       = "920T:ARM 920T Processor",0
PName_ARM922T       = "922T:ARM 922T Processor",0
PName_X80200        = "X80200:80200 Processor",0
PName_X80321        = "X80321:80321 Processor",0
PName_ARM1176JZF_S  = "ARM1176JZF_S:ARM1176JZF-S Processor",0
PName_Cortex_A5     = "CA5:Cortex-A5 Processor",0
PName_Cortex_A7     = "CA7:Cortex-A7 Processor",0
PName_Cortex_A8     = "CA8:Cortex-A8 Processor",0
PName_Cortex_A9     = "CA9:Cortex-A9 Processor",0
PName_Cortex_A15    = "CA15:Cortex-A15 Processor",0
PName_Cortex_A17    = "CA17:Cortex-A17 Processor",0
        ALIGN

; Lookup tables from DA flags PCB (bits 14:12,5,4, packed down to 4:2,1,0)
; to XCB bits in page table descriptors.

XCB_CB  *       0:SHL:0
XCB_NB  *       1:SHL:0
XCB_NC  *       1:SHL:1
XCB_P   *       1:SHL:2
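; As an illustrative sketch only (the translation itself is performed by the
; page table construction code elsewhere in the kernel, not here), the intended
; use of each 32-byte table below is:
;
;       index = ((DA flags :SHR: 10) :AND: 2_11100) :OR: ((DA flags :SHR: 4) :AND: 2_11)
;       XCB   = byte at table + index           ; L2_C/L2_B (and L2_X) bits for the L2 descriptor
;
; i.e. the policy field (bits 14:12) selects the row and bits 5:4 select the
; column, matching the C+B / CNB / NCB / NCNB headings (the exact assignment of
; NC and NB to bits 5 and 4 is inferred from the column order, not stated here).
;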
        ALIGN   32

; WT read-allocate cache (eg ARM720T)
XCBTableWT      ; C+B           CNB   NCB   NCNB
        =       L2_C+L2_B,      L2_C, L2_B, 0   ; Default
        =       L2_C+L2_B,      L2_C, L2_B, 0   ; WT,         X, Non-merging, X
        =       L2_C+L2_B,      L2_C, L2_B, 0   ; WB/RA,      X, Merging, X
        =       L2_C+L2_B,      L2_C, L2_B, 0   ; WB/WA,      X, X, X
        =       L2_C+L2_B,      L2_C, L2_B, 0   ; Alt DCache, X, X, X
        =       L2_C+L2_B,      L2_C, L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      L2_C, L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      L2_C, L2_B, 0   ; X,          X, X, X

; SA-110 in Risc PC - WB only read-allocate cache, non-merging WB
XCBTableSA110   ; C+B           CNB   NCB   NCNB
        =       L2_C+L2_B,      0,    L2_B, 0   ; Default
        =       L2_B,           0,    L2_B, 0   ; WT,         X, Non-merging, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; WB/RA,      X, Merging, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; WB/WA,      X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; Alt DCache, X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X

; ARMv5 WB/WT read-allocate cache, non-merging WB (eg ARM920T)
XCBTableWBR     ; C+B           CNB   NCB   NCNB
        =       L2_C+L2_B,      0,    L2_B, 0   ; Default
        =       L2_C,           0,    L2_B, 0   ; WT,         X, Non-merging, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; WB/RA,      X, Merging, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; WB/WA,      X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; Alt DCache, X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X

; SA-1110 - WB only read allocate cache, merging WB, mini D-cache
XCBTableSA1110  ; C+B           CNB   NCB   NCNB
        =       L2_C+L2_B,      0,    L2_B, 0   ; Default
        =       L2_B,           0,    0,    0   ; WT,         X, Non-merging, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; WB/RA,      X, Merging, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; WB/WA,      X, X, X
        =       L2_C,           0,    L2_B, 0   ; Alt DCache, X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
; defaulting to read-allocate
XCBTableXScaleRA ; C+B              CNB  NCB        NCNB
        =       L2_C+L2_B,          0,   L2_B,      0 ; Default
        =       L2_C,               0,   L2_X+L2_B, 0 ; WT,         X, Non-merging, X
        =       L2_C+L2_B,          0,   L2_B,      0 ; WB/RA,      X, Merging, X
        =       L2_X+L2_C+L2_B,     0,   L2_B,      0 ; WB/WA,      X, X, X
        =       L2_X+L2_C,          0,   L2_B,      0 ; Alt DCache, X, X, X
        =       L2_C+L2_B,          0,   L2_B,      0 ; X,          X, X, X
        =       L2_C+L2_B,          0,   L2_B,      0 ; X,          X, X, X
        =       L2_C+L2_B,          0,   L2_B,      0 ; X,          X, X, X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
; defaulting to write-allocate
XCBTableXScaleWA ; C+B              CNB  NCB        NCNB
        =       L2_X+L2_C+L2_B,     0,   L2_B,      0 ; Default
        =       L2_C,               0,   L2_X+L2_B, 0 ; WT,         X, Non-merging, X
        =       L2_C+L2_B,          0,   L2_B,      0 ; WB/RA,      X, Merging, X
        =       L2_X+L2_C+L2_B,     0,   L2_B,      0 ; WB/WA,      X, X, X
        =       L2_X+L2_C,          0,   L2_B,      0 ; Alt DCache, X, X, X
        =       L2_X+L2_C+L2_B,     0,   L2_B,      0 ; X,          X, X, X
        =       L2_X+L2_C+L2_B,     0,   L2_B,      0 ; X,          X, X, X
        =       L2_X+L2_C+L2_B,     0,   L2_B,      0 ; X,          X, X, X

; XScale - WB/WT read-allocate cache, merging WB, no mini D-cache/extended pages
XCBTableXScaleNoExt ; C+B       CNB   NCB   NCNB
        =       L2_C+L2_B,      0,    L2_B, 0   ; Default
        =       L2_C,           0,    0,    0   ; WT,         X, Non-merging, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; WB/RA,      X, Merging, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; WB/WA,      X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; Alt DCache, X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X
        =       L2_C+L2_B,      0,    L2_B, 0   ; X,          X, X, X

        END