; Copyright 2000 Pace Micro Technology plc ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; See the License for the specific language governing permissions and ; limitations under the License. ; ; GET Hdr:ListOpts ; GET Hdr:Macros ; GET Hdr:System ; $GetCPU ; $GetMEMM ; GET hdr.Options ; GET Hdr:PublicWS ; GET Hdr:KernelWS ; GET hdr.Copro15ops ; GET hdr.ARMops v7 RN 10 ; EXPORT Init_ARMarch ; EXPORT ARM_Analyse ; EXPORT ARM_PrintProcessorType ; AREA KernelCode,CODE,READONLY ; ARM keep changing their mind about ID field layout. ; Here's a summary, courtesy of the ARM ARM (v5): ; ; pre-ARM 7: xxxx0xxx ; ARM 7: xxxx7xxx where bit 23 indicates v4T/~v3 ; post-ARM 7: xxxanxxx where n<>0 or 7 and a = architecture (1=4,2=4T,3=5,4=5T) ; ; int Init_ARMarch(void) ; Returns architecture, as above in a1. Also EQ if ARMv3, NE if ARMv4 or later. ; Corrupts only ip, no RAM usage. Init_ARMarch ARM_read_ID ip ANDS a1, ip, #&0000F000 MOVEQ pc, lr ; ARM 3 or ARM 6 TEQ a1, #&00007000 BNE %FT20 TST ip, #&00800000 ; ARM 7 - check for Thumb MOVNE a1, #ARMv4T MOVEQ a1, #ARMv3 MOV pc, lr 20 ANDS a1, ip, #&000F0000 ; post-ARM 7 MOV a1, a1, LSR #16 MOV pc, lr ARM_Analyse MOV a2, lr BL Init_ARMarch MOV lr, a2 [ MEMM_Type = "VMSAv6" CMP a1, #ARMvF BEQ ARM_Analyse_Fancy ; New ARM; use the feature regs to perform all the setup ] Push "v1,v2,v5,v6,v7,lr" ARM_read_ID v1 ARM_read_cachetype v2 LDR v6, =ZeroPage ADRL v7, KnownCPUTable FindARMloop LDMIA v7!, {a1, a2} ; See if it's a known ARM CMP a1, #-1 BEQ %FT20 AND a2, v1, a2 TEQ a1, a2 ADDNE v7, v7, #8 BNE FindARMloop TEQ v2, v1 ; If we don't have cache attributes, read from table LDREQ v2, [v7] 20 TEQ v2, v1 BEQ %BT20 ; Cache unknown: panic CMP a1, #-1 LDRNEB a2, [v7, #4] MOVEQ a2, #ARMunk STRB a2, [v6, #ProcessorType] ASSERT CT_Isize_pos = 0 MOV a1, v2 ADD a2, v6, #ICache_Info BL EvaluateCache MOV a1, v2, LSR #CT_Dsize_pos ADD a2, v6, #DCache_Info BL EvaluateCache AND a1, v2, #CT_ctype_mask MOV a1, a1, LSR #CT_ctype_pos STRB a1, [v6, #Cache_Type] [ No26bitCode MOV v5, #CPUFlag_32bitOS | MOV v5, #0 ] [ HiProcVecs ORR v5, v5, #CPUFlag_HiProcVecs ] TST v2, #CT_S ORRNE v5, v5, #CPUFlag_SplitCache+CPUFlag_SynchroniseCodeAreas [ CacheOff ORR v5, v5, #CPUFlag_SynchroniseCodeAreas | ARM_read_control a1 ; if Z bit set then we have branch prediction, TST a1, #MMUC_Z ; so we need OS_SynchroniseCodeAreas even if not ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas ; split caches ] ; Test abort timing (base restored or base updated) MOV a1, #&8000 LDR a2, [a1], #4 ; Will abort - DAb handler will continue execution TEQ a1, #&8000 ORREQ v5, v5, #CPUFlag_BaseRestored ; Check store of PC 30 STR pc, [sp, #-4]! ADR a2, %BT30 + 8 LDR a1, [sp], #4 TEQ a1, a2 ORREQ v5, v5, #CPUFlag_StorePCplus8 [ 0=1 ; Check whether 26-bit mode is available MSR CPSR_c, #F32_bit+I32_bit+SVC26_mode MRS a1, CPSR AND a1, a1, #M32_bits TEQ a1, #SVC26_mode ORRNE v5, v5, #CPUFlag_No26bitMode MSREQ CPSR_c, #F32_bit+I32_bit+SVC32_mode BNE %FT35 ; Do we get vector exceptions on read? 
LDR a2, =ZeroPage MOV a1, a2 LDR a1, [a1] ; If this aborts a1 will be left unchanged TEQ a1, a2 ORREQ v5, v5, #CPUFlag_VectorReadException ] 35 BL Init_ARMarch STRB a1, [v6, #ProcessorArch] TEQ a1, #ARMv3 ; assume long multiply available ORRNE v5, v5, #CPUFlag_LongMul ; if v4 or later TEQNE a1, #ARMv4 ; assume 26-bit available ORRNE v5, v5, #CPUFlag_No26bitMode ; iff v3 or v4 (not T) TEQNE a1, #ARMv5 ; assume Thumb available ORRNE v5, v5, #CPUFlag_Thumb ; iff not v3,v4,v5 MSR CPSR_f, #Q32_bit MRS lr, CPSR TST lr, #Q32_bit ORRNE v5, v5, #CPUFlag_DSP LDRB v4, [v6, #ProcessorType] TEQ v4, #ARMunk ; Modify deduced flags ADRNEL lr, KnownCPUFlags ADDNE lr, lr, v4, LSL #3 LDMNEIA lr, {a2, a3} ORRNE v5, v5, a2 BICNE v5, v5, a3 [ XScaleJTAGDebug TST v5, #CPUFlag_XScale BEQ %FT40 MRC p14, 0, a2, c10, c0 ; Read debug control register TST a2, #&80000000 ORRNE v5, v5, #CPUFlag_XScaleJTAGconnected MOVEQ a2, #&C000001C ; enable hot debug MCREQ p14, 0, a2, c10, c0 BNE %FT40 40 ] STR v5, [v6, #ProcessorFlags] ; Now, a1 = processor architecture (ARMv3, ARMv4 ...) ; v4 = processor type (ARM600, ARM610, ...) ; v5 = processor flags CMP a1, #ARMv4 BLO Analyse_ARMv3 ; eg. ARM710 LDRB a2, [v6, #Cache_Type] TEQ a2, #CT_ctype_WT TSTEQ v5, #CPUFlag_SplitCache BEQ Analyse_WriteThroughUnified ; eg. ARM7TDMI derivative TEQ a2, #CT_ctype_WB_CR7_LDa BEQ Analyse_WB_CR7_LDa ; eg. ARM9 TEQ a2, #CT_ctype_WB_Crd BEQ Analyse_WB_Crd ; eg. StrongARM TEQ a2, #CT_ctype_WB_Cal_LD BEQ Analyse_WB_Cal_LD ; assume XScale ; others ... WeirdARMPanic B WeirdARMPanic ; stiff :) Analyse_ARMv3 ADRL a1, NullOp ADRL a2, Cache_Invalidate_ARMv3 ADRL a3, WriteBuffer_Drain_ARMv3 ADRL a4, TLB_Invalidate_ARMv3 ADRL ip, TLB_InvalidateEntry_ARMv3 STR a1, [v6, #Proc_Cache_CleanAll] STR a2, [v6, #Proc_Cache_CleanInvalidateAll] STR a2, [v6, #Proc_Cache_InvalidateAll] STR a3, [v6, #Proc_WriteBuffer_Drain] STR a4, [v6, #Proc_TLB_InvalidateAll] STR ip, [v6, #Proc_TLB_InvalidateEntry] STR a1, [v6, #Proc_IMB_Full] STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_ARMv3 ADRL a2, MMU_ChangingEntry_ARMv3 ADRL a3, MMU_ChangingUncached_ARMv3 ADRL a4, MMU_ChangingUncachedEntry_ARMv3 STR a1, [v6, #Proc_MMU_Changing] STR a2, [v6, #Proc_MMU_ChangingEntry] STR a3, [v6, #Proc_MMU_ChangingUncached] STR a4, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_ARMv3 ADRL a2, MMU_ChangingUncachedEntries_ARMv3 ADRL a3, Cache_RangeThreshold_ARMv3 STR a1, [v6, #Proc_MMU_ChangingEntries] STR a2, [v6, #Proc_MMU_ChangingUncachedEntries] STR a3, [v6, #Proc_Cache_RangeThreshold] ADRL a1, XCBTableWT STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WriteThroughUnified ADRL a1, NullOp ADRL a2, Cache_InvalidateUnified TST v5, #CPUFlag_NoWBDrain ADRNEL a3, WriteBuffer_Drain_OffOn ADREQL a3, WriteBuffer_Drain ADRL a4, TLB_Invalidate_Unified ADRL ip, TLB_InvalidateEntry_Unified STR a1, [v6, #Proc_Cache_CleanAll] STR a2, [v6, #Proc_Cache_CleanInvalidateAll] STR a2, [v6, #Proc_Cache_InvalidateAll] STR a3, [v6, #Proc_WriteBuffer_Drain] STR a4, [v6, #Proc_TLB_InvalidateAll] STR ip, [v6, #Proc_TLB_InvalidateEntry] STR a1, [v6, #Proc_IMB_Full] STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_Writethrough ADRL a2, MMU_ChangingEntry_Writethrough ADRL a3, MMU_ChangingUncached ADRL a4, MMU_ChangingUncachedEntry STR a1, [v6, #Proc_MMU_Changing] STR a2, [v6, #Proc_MMU_ChangingEntry] STR a3, [v6, #Proc_MMU_ChangingUncached] STR a4, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_Writethrough ADRL a2, MMU_ChangingUncachedEntries ADRL a3, 
Cache_RangeThreshold_Writethrough STR a1, [v6, #Proc_MMU_ChangingEntries] STR a2, [v6, #Proc_MMU_ChangingUncachedEntries] STR a3, [v6, #Proc_Cache_RangeThreshold] ADRL a1, XCBTableWT STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WB_CR7_LDa TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard caches here (eg. ARM920) ADRL a1, Cache_CleanInvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_CR7_LDa STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_CR7_LDa STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_CR7_LDa STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_CR7_LDa STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_CR7_LDa STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_CR7_LDa STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] LDRB a2, [v6, #DCache_Associativity] MOV a3, #256 MOV a4, #8 ; to find log2(ASSOC), rounded up Analyse_WB_CR7_LDa_L1 MOV a3, a3, LSR #1 SUB a4, a4, #1 CMP a2, a3 BLO Analyse_WB_CR7_LDa_L1 ADDHI a4, a4, #1 RSB a2, a4, #32 MOV a3, #1 MOV a3, a3, LSL a2 STR a3, [v6, #DCache_IndexBit] LDR a4, [v6, #DCache_NSets] LDRB a2, [v6, #DCache_LineLen] SUB a4, a4, #1 MUL a4, a2, a4 STR a4, [v6, #DCache_IndexSegStart] MOV a2, #64*1024 ; arbitrary-ish STR a2, [v6, #DCache_RangeThreshold] ADRL a1, XCBTableWBR ; assume read-allocate WB/WT cache STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WB_Crd TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard ADRL a1, Cache_CleanInvalidateAll_WB_Crd STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_Crd STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_Crd STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_Crd STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_Crd STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_Crd STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_Crd STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_Crd STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_Crd STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_Crd STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_Crd STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_Crd STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] LDR a2, =DCacheCleanAddress STR a2, [v6, #DCache_CleanBaseAddress] STR a2, [v6, #DCache_CleanNextAddress] MOV a2, #64*1024 ;arbitrary-ish threshold STR a2, [v6, #DCache_RangeThreshold] LDRB a2, [v6, #ProcessorType] TEQ a2, #SA110 ADREQL a2, XCBTableSA110 BEQ 
Analyse_WB_Crd_finish TEQ a2, #SA1100 TEQNE a2, #SA1110 ADREQL a2, XCBTableSA1110 ADRNEL a2, XCBTableWBR Analyse_WB_Crd_finish STR a2, [v6, #MMU_PCBTrans] B %FT90 Analyse_WB_Cal_LD TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard ADRL a1, Cache_CleanInvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_Cal_LD STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_Cal_LD STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_Cal_LD STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_Cal_LD STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_Cal_LD STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_Cal_LD STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] LDR a2, =DCacheCleanAddress STR a2, [v6, #DCache_CleanBaseAddress] STR a2, [v6, #DCache_CleanNextAddress] [ XScaleMiniCache ! 1, "You need to arrange for XScale mini-cache clean area to be mini-cacheable" LDR a2, =DCacheCleanAddress + 4 * 32*1024 STR a2, [v6, #MCache_CleanBaseAddress] STR a2, [v6, #MCache_CleanNextAddress] ] ; arbitrary-ish values, mini cache makes global op significantly more expensive [ XScaleMiniCache MOV a2, #128*1024 | MOV a2, #32*1024 ] STR a2, [v6, #DCache_RangeThreshold] ; enable full coprocessor access LDR a2, =&3FFF MCR p15, 0, a2, c15, c1 ADRL a2, XCBTableXScaleWA ; choose between RA and WA here STR a2, [v6, #MMU_PCBTrans] B %FT90 [ MEMM_Type = "VMSAv6" Analyse_WB_CR7_Lx TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard caches here ; Read the cache info into Cache_Lx_* MRC p15, 1, a1, c0, c0, 1 ; Cache level ID register MOV v2, v6 ; Work around DTable/ITable alignment issues STR a1, [v2, #Cache_Lx_Info]! ADD a1, v2, #Cache_Lx_DTable-Cache_Lx_Info ADD a2, v2, #Cache_Lx_ITable-Cache_Lx_Info MOV a3, #0 MOV a4, #256 ; Smallest instruction cache line length MOV v2, #256 ; Smallest data/unified cache line length (although atm we only need this to be the smallest data cache line length) 10 MCR p15, 2, a3, c0, c0, 0 ; Program cache size selection register MRC p15, 1, v1, c0, c0, 0 ; Get size info (data/unified) STR v1, [a1],#4 CMP v1, #0 ; Does the cache exist? AND v1, v1, #7 ; Get line size CMPNE v1, v2 MOVLT v2, v1 ; Earlier CMP will not set LE flags if v1=0 ADD a3, a3, #1 MCR p15, 2, a3, c0, c0, 0 ; Program cache size selection register MRC p15, 1, v1, c0, c0, 0 ; Get size info (instruction) STR v1, [a2],#4 CMP v1, #0 ; Does the cache exist? 
AND v1, v1, #7 ; Get line size CMPNE v1, a4 MOVLT a4, v1 ; Earlier CMP will not set LE flags if v1=0 ADD a3, a3, #1 CMP a3, #16 BLT %BT10 STRB a4, [v6, #ICache_LineLen] ; Store log2(line size)-2 STRB v2, [v6, #DCache_LineLen] ; log2(line size)-2 ; Calculate DCache_RangeThreshold MOV a1, #128*1024 ; Arbitrary-ish STR a1, [v6, #DCache_RangeThreshold] ADRL a1, Cache_CleanInvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_CR7_Lx STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_CR7_Lx STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_CR7_Lx STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_CR7_Lx STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_CR7_Lx STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_CR7_Lx STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] ADRL a1, XCBTableWBR ; assume read-allocate WB/WT cache STR a1, [v6, #MMU_PCBTrans] ; Enable L2 cache. This could probably be moved earlier on in the boot sequence (e.g. when the MMU is turned on), but for now it will go here to reduce the chances of stuff breaking BL Cache_CleanInvalidateAll_WB_CR7_Lx ; Ensure L2 cache is clean MRC p15, 0, a1, c1, c0, 1 ORR a1, a1, #2 ; L2EN MCR p15, 0, a1, c1, c0, 1 B %FT90 ] ; MEMM_Type = "VMSAv6" 90 Pull "v1,v2,v5,v6,v7,pc" ; This routine works out the values LINELEN, ASSOCIATIVITY, NSETS and CACHE_SIZE defined in section ; B2.3.3 of the ARMv5 ARM. 
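;
; As a rough illustration (not used by the kernel itself), the decode performed
; below can be modelled in C as follows, assuming the ARMv5 cache type register
; field layout (len in bits 1:0, M in bit 2, assoc in bits 5:3, size in bits
; 9:6 of each 12-bit Isize/Dsize field); the struct and function names are
; illustrative only:
;
;   typedef struct { unsigned linelen, assoc, nsets, size; } cache_info;
;
;   static cache_info evaluate_cache(unsigned field)   /* 12-bit Isize or Dsize */
;   {
;       cache_info c = { 0, 0, 0, 0 };
;       unsigned len   = field & 3;
;       unsigned m     = (field >> 2) & 1;
;       unsigned assoc = (field >> 3) & 7;
;       unsigned size  = (field >> 6) & 15;
;       unsigned mult  = 2 + m;
;       if (assoc == 0 && m)                         /* "cache absent" encoding */
;           return c;
;       c.linelen = 1u << (len + 3);                 /* LineLen = 1 << (len+3)            */
;       c.assoc   = mult << (assoc - 1);             /* Associativity = mult << (assoc-1) */
;       c.size    = mult << (size + 8);              /* Size = mult << (size+8)           */
;       c.nsets   = 1u << (size + 6 - assoc - len);  /* NSets, so that                    */
;       return c;                                    /* Size = NSets*Assoc*LineLen        */
;   }
;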
EvaluateCache
        AND     a3, a1, #CT_assoc_mask+CT_M
        TEQ     a3, #(CT_assoc_0:SHL:CT_assoc_pos)+CT_M
        BEQ     %FT80
        MOV     ip, #1
        ASSERT  CT_len_pos = 0
        AND     a4, a1, #CT_len_mask
        ADD     a4, a4, #3
        MOV     a4, ip, LSL a4                          ; LineLen = 1 << (len+3)
        STRB    a4, [a2, #ICache_LineLen-ICache_Info]
        MOV     a3, #2
        TST     a1, #CT_M
        ADDNE   a3, a3, #1                              ; Multiplier = 2 + M
        AND     a4, a1, #CT_assoc_mask
        RSB     a4, ip, a4, LSR #CT_assoc_pos
        MOV     a4, a3, LSL a4                          ; Associativity = Multiplier << (assoc-1)
        STRB    a4, [a2, #ICache_Associativity-ICache_Info]
        AND     a4, a1, #CT_size_mask
        MOV     a4, a4, LSR #CT_size_pos
        MOV     a3, a3, LSL a4
        MOV     a3, a3, LSL #8                          ; Size = Multiplier << (size+8)
        STR     a3, [a2, #ICache_Size-ICache_Info]
        ADD     a4, a4, #6
        AND     a3, a1, #CT_assoc_mask
        SUB     a4, a4, a3, LSR #CT_assoc_pos
        AND     a3, a1, #CT_len_mask
        ASSERT  CT_len_pos = 0
        SUB     a4, a4, a3
        MOV     a4, ip, LSL a4                          ; NSets = 1 << (size + 6 - assoc - len)
        STR     a4, [a2, #ICache_NSets-ICache_Info]
        MOV     pc, lr

80      MOV     a1, #0
        STR     a1, [a2, #ICache_NSets-ICache_Info]
        STR     a1, [a2, #ICache_Size-ICache_Info]
        STRB    a1, [a2, #ICache_LineLen-ICache_Info]
        STRB    a1, [a2, #ICache_Associativity-ICache_Info]
        MOV     pc, lr

; Create a list of CPUs, 16 bytes per entry:
;    ID bits (1 word)
;    Test mask for ID (1 word)
;    Cache type register value (1 word)
;    Processor type (1 byte)
;    Architecture type (1 byte)
;    Reserved (2 bytes)

        GBLA    tempcpu

        MACRO
        CPUDesc $proc, $id, $mask, $arch, $type, $s, $dsz, $das, $dln, $isz, $ias, $iln
        LCLA    type
type    SETA    (CT_ctype_$type:SHL:CT_ctype_pos)+($s:SHL:CT_S_pos)
tempcpu CSzDesc $dsz, $das, $dln
type    SETA    type+(tempcpu:SHL:CT_Dsize_pos)
        [ :LNOT:($s=0 :LAND: "$isz"="")
tempcpu CSzDesc $isz, $ias, $iln
        ]
type    SETA    type+(tempcpu:SHL:CT_Isize_pos)
        ASSERT  ($id :AND: :NOT: $mask) = 0
        DCD     $id, $mask, type
        DCB     $proc, $arch, 0, 0
        MEND

        MACRO
$var    CSzDesc $sz, $as, $ln
$var    SETA    (CT_size_$sz:SHL:CT_size_pos)+(CT_assoc_$as:SHL:CT_assoc_pos)+(CT_len_$ln:SHL:CT_len_pos)
$var    SETA    $var+(CT_M_$sz:SHL:CT_M_pos)
        MEND

; CPUDesc table for ARMv3-ARMv6
KnownCPUTable
;                                                  /------Cache Type register fields-----\
;                              ID reg     Mask      Arch     Type       S  Dsz Das Dln Isz Ias Iln
        CPUDesc ARM600,        &000600,   &00FFF0,  ARMv3,   WT,        0,  4K, 64, 4
        CPUDesc ARM610,        &000610,   &00FFF0,  ARMv3,   WT,        0,  4K, 64, 4
        CPUDesc ARMunk,        &000000,   &00F000,  ARMv3,   WT,        0,  4K, 64, 4
        CPUDesc ARM700,        &007000,   &FFFFF0,  ARMv3,   WT,        0,  8K,  4, 8
        CPUDesc ARM710,        &007100,   &FFFFF0,  ARMv3,   WT,        0,  8K,  4, 8
        CPUDesc ARM710a,       &047100,   &FDFFF0,  ARMv3,   WT,        0,  8K,  4, 4
        CPUDesc ARM7500,       &067100,   &FFFFF0,  ARMv3,   WT,        0,  4K,  4, 4
        CPUDesc ARM7500FE,     &077100,   &FFFFF0,  ARMv3,   WT,        0,  4K,  4, 4
        CPUDesc ARMunk,        &007000,   &80F000,  ARMv3,   WT,        0,  8K,  4, 4
        CPUDesc ARM720T,       &807200,   &FFFFF0,  ARMv4T,  WT,        0,  8K,  4, 4
        CPUDesc ARMunk,        &807000,   &80F000,  ARMv4T,  WT,        0,  8K,  4, 4
        CPUDesc SA110_preRevT, &01A100,   &0FFFFC,  ARMv4,   WB_Crd,    1, 16K, 32, 8, 16K, 32, 8
        CPUDesc SA110,         &01A100,   &0FFFF0,  ARMv4,   WB_Crd,    1, 16K, 32, 8, 16K, 32, 8
        CPUDesc SA1100,        &01A110,   &0FFFF0,  ARMv4,   WB_Crd,    1,  8K, 32, 8, 16K, 32, 8
        CPUDesc SA1110,        &01B110,   &0FFFF0,  ARMv4,   WB_Crd,    1,  8K, 32, 8, 16K, 32, 8
        CPUDesc ARM920T,       &029200,   &0FFFF0,  ARMv4T,  WB_CR7_LDa, 1, 16K, 64, 8, 16K, 64, 8
        CPUDesc ARM922T,       &029220,   &0FFFF0,  ARMv4T,  WB_CR7_LDa, 1,  8K, 64, 8,  8K, 64, 8
        CPUDesc X80200,        &052000,   &0FFFF0,  ARMv5TE, WB_Cal_LD, 1, 32K, 32, 8, 32K, 32, 8
        CPUDesc X80321,        &69052400, &FFFFF700, ARMv5TE, WB_Cal_LD, 1, 32K, 32, 8, 32K, 32, 8
        DCD     -1

; Simplified CPUDesc table for ARMvF
; The cache size data is ignored for ARMv7.
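;
; Both KnownCPUTable above and KnownCPUTable_Fancy below are searched the same
; way (see FindARMloop in ARM_Analyse, and the equivalent loop in
; ARM_Analyse_Fancy): an entry matches when (ID reg AND Mask) = ID bits, and the
; first match wins, so specific entries must precede the broader ARMunk ones.
; A minimal C sketch of the scheme (struct and function names are illustrative
; only, not part of the kernel):
;
;   struct cpu_desc {                   /* 16 bytes per entry                  */
;       unsigned      id;               /* expected ID register bits           */
;       unsigned      mask;             /* which ID register bits to compare   */
;       unsigned      ctype;            /* cache type register value           */
;       unsigned char proc, arch;       /* processor type, architecture        */
;       unsigned char reserved[2];
;   };
;
;   static const struct cpu_desc *find_cpu(const struct cpu_desc *t, unsigned id_reg)
;   {
;       for (; t->id != 0xFFFFFFFFu; t++)      /* table ends with DCD -1       */
;           if ((id_reg & t->mask) == t->id)
;               return t;
;       return 0;                              /* unknown CPU                  */
;   }
;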
KnownCPUTable_Fancy CPUDesc Cortex_A8, &00C080, &00FFF0, ARMvF, WB_CR7_Lx, 1, 16K, 32, 16, 16K, 32, 16 CPUDesc ARM1176JZF_S, &00B760, &00FFF0, ARMv6, WB_CR7_LDa, 1, 16K, 32, 16,16K, 32, 16 DCD -1 ; Peculiar characteristics of individual ARMs not deducable otherwise. First field is ; flags to set, second flags to clear. KnownCPUFlags DCD 0, 0 ; ARM 600 DCD 0, 0 ; ARM 610 DCD 0, 0 ; ARM 700 DCD 0, 0 ; ARM 710 DCD 0, 0 ; ARM 710a DCD CPUFlag_AbortRestartBroken+CPUFlag_InterruptDelay, 0 ; SA 110 pre revT DCD CPUFlag_InterruptDelay, 0 ; SA 110 revT or later DCD 0, 0 ; ARM 7500 DCD 0, 0 ; ARM 7500FE DCD CPUFlag_InterruptDelay, 0 ; SA 1100 DCD CPUFlag_InterruptDelay, 0 ; SA 1110 DCD CPUFlag_NoWBDrain, 0 ; ARM 720T DCD 0, 0 ; ARM 920T DCD 0, 0 ; ARM 922T DCD CPUFlag_ExtendedPages+CPUFlag_XScale, 0 ; X80200 DCD CPUFlag_XScale, 0 ; X80321 DCD 0, 0 ; Cortex_A8 DCD 0, 0 ; ARM1176JZF_S [ MEMM_Type = "VMSAv6" ; -------------------------------------------------------------------------- ; ----- ARM_Analyse_Fancy -------------------------------------------------- ; -------------------------------------------------------------------------- ; ; For ARMv7 ARMs (arch=&F), we can detect everything via the feature registers ; TODO - There's some stuff in here that can be tidied up/removed ; Things we need to set up: ; ProcessorType (as listed in hdr.ARMops) ; Cache_Type (CT_ctype_* from hdr:MEMM.ARM600) ; ProcessorArch (as reported by Init_ARMarch) ; ProcessorFlags (CPUFlag_* from hdr.ARMops) ; Proc_* (Cache/TLB/IMB/MMU function pointers) ; MMU_PCBTrans (Points to lookup table for translating page table cache options) ; ICache_*, DCache_* (ICache, DCache properties - optional, since not used externally?) ARM_Analyse_Fancy Push "v1,v2,v5,v6,v7,lr" ARM_read_ID v1 LDR v6, =ZeroPage ADRL v7, KnownCPUTable_Fancy 10 LDMIA v7!, {a1, a2} CMP a1, #-1 BEQ %FT20 AND a2, v1, a2 TEQ a1, a2 ADDNE v7, v7, #8 BNE %BT10 20 LDR v2, [v7] CMP a1, #-1 LDRNEB a2, [v7, #4] MOVEQ a2, #ARMunk STRB a2, [v6, #ProcessorType] AND a1, v2, #CT_ctype_mask MOV a1, a1, LSR #CT_ctype_pos STRB a1, [v6, #Cache_Type] MOV v5, #CPUFlag_32bitOS+CPUFlag_No26bitMode ; 26bit has been obsolete for a long time [ HiProcVecs ORR v5, v5, #CPUFlag_HiProcVecs ] ; Work out whether the cache info is in ARMv6 or ARMv7 style MRC p15, 0, a1, c0, c0, 1 TST a1, #&80000000 BNE %FT25 ; ARMv6 format cache type register. ; TODO - Use the cache type register to deduce the cache info. ; For now, just fall back on the values in the CPU table. ASSERT CT_Isize_pos = 0 MOV a1, v2 ADD a2, v6, #ICache_Info BL EvaluateCache MOV a1, v2, LSR #CT_Dsize_pos ADD a2, v6, #DCache_Info BL EvaluateCache B %FT27 25 ; ARMv7 format cache type register. This should(!) mean that we have the cache level ID register, and all the other ARMv7 cache registers. ; Do we have a split cache? MRC p15, 1, a1, c0, c0, 1 AND a2, a1, #7 TEQ a2, #3 ORREQ v5, v5, #CPUFlag_SynchroniseCodeAreas+CPUFlag_SplitCache 27 [ CacheOff ORR v5, v5, #CPUFlag_SynchroniseCodeAreas | ARM_read_control a1 ; if Z bit set then we have branch prediction, TST a1, #MMUC_Z ; so we need OS_SynchroniseCodeAreas even if not ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas ; split caches ] ; Test abort timing (base restored or base updated) MOV a1, #&8000 LDR a2, [a1], #4 ; Will abort - DAb handler will continue execution TEQ a1, #&8000 ORREQ v5, v5, #CPUFlag_BaseRestored ; Check store of PC 30 STR pc, [sp, #-4]! 
ADR a2, %BT30 + 8 LDR a1, [sp], #4 TEQ a1, a2 ORREQ v5, v5, #CPUFlag_StorePCplus8 BL Init_ARMarch STRB a1, [v6, #ProcessorArch] MRC p15, 0, a1, c0, c2, 2 TST a1, #&F000 ORRNE v5, v5, #CPUFlag_LongMul MRC p15, 0, a1, c0, c1, 0 TST a1, #&F000 ORRNE v5, v5, #CPUFlag_Thumb MSR CPSR_f, #Q32_bit MRS lr, CPSR TST lr, #Q32_bit ORRNE v5, v5, #CPUFlag_DSP ; Should we check instruction set attr register 3 for this? ; Other flags not checked for above: ; CPUFlag_InterruptDelay ; CPUFlag_VectorReadException ; CPUFlag_ExtendedPages ; CPUFlag_NoWBDrain ; CPUFlag_AbortRestartBroken ; CPUFlag_XScale ; CPUFlag_XScaleJTAGconnected LDRB v4, [v6, #ProcessorType] TEQ v4, #ARMunk ; Modify deduced flags ADRNEL lr, KnownCPUFlags ADDNE lr, lr, v4, LSL #3 LDMNEIA lr, {a2, a3} ORRNE v5, v5, a2 BICNE v5, v5, a3 STR v5, [v6, #ProcessorFlags] ; Cache analysis LDRB a2, [v6, #Cache_Type] TEQ a2, #CT_ctype_WT TSTEQ v5, #CPUFlag_SplitCache BEQ Analyse_WriteThroughUnified ; eg. ARM7TDMI derivative TEQ a2, #CT_ctype_WB_CR7_LDa BEQ Analyse_WB_CR7_LDa ; eg. ARM9 TEQ a2, #CT_ctype_WB_Crd BEQ Analyse_WB_Crd ; eg. StrongARM TEQ a2, #CT_ctype_WB_Cal_LD BEQ Analyse_WB_Cal_LD ; assume XScale TEQ a2, #CT_ctype_WB_CR7_Lx BEQ Analyse_WB_CR7_Lx ; others ... B WeirdARMPanic ; stiff :) ] ; MEMM_Type = "VMSAv6" ; -------------------------------------------------------------------------- ; ----- ARMops ------------------------------------------------------------- ; -------------------------------------------------------------------------- ; ; ARMops are the routines required by the kernel for cache/MMU control ; the kernel vectors to the appropriate ops for the given ARM at boot ; ; The Rules: ; - These routines may corrupt a1 and lr only ; - (lr can of course only be corrupted whilst still returning to correct ; link address) ; - stack is available, at least 16 words can be stacked ; - a NULL op would be a simple MOV pc, lr ; ; -------------------------------------------------------------------------- ; ----- ARMops for ARMv3 --------------------------------------------------- ; -------------------------------------------------------------------------- ; ; ARMv3 ARMs include ARM710, ARM610, ARM7500 ; Cache_Invalidate_ARMv3 MCR p15, 0, a1, c7, c0 NullOp MOV pc, lr WriteBuffer_Drain_ARMv3 ;swap always forces unbuffered write, stalling till WB empty SUB sp, sp, #4 SWP a1, a1, [sp] ADD sp, sp, #4 MOV pc, lr TLB_Invalidate_ARMv3 MCR p15, 0, a1, c5, c0 MOV pc, lr ; a1 = page entry to invalidate (page aligned address) ; TLB_InvalidateEntry_ARMv3 MCR p15, 0, a1, c6, c0 MOV pc, lr MMU_Changing_ARMv3 MCR p15, 0, a1, c7, c0 ; invalidate cache MCR p15, 0, a1, c5, c0 ; invalidate TLB MOV pc, lr MMU_ChangingUncached_ARMv3 MCR p15, 0, a1, c5, c0 ; invalidate TLB MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_ARMv3 MCR p15, 0, a1, c7, c0 ; invalidate cache MCR p15, 0, a1, c6, c0 ; invalidate TLB entry MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_ARMv3 ROUT CMP a2, #16 ; arbitrary-ish threshold BHS MMU_Changing_ARMv3 Push "a2" MCR p15, 0, a1, c7, c0 ; invalidate cache 10 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2" MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingUncachedEntry_ARMv3 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries_ARMv3 ROUT CMP a2, #16 ; arbitrary-ish 
threshold BHS MMU_ChangingUncached_ARMv3 Push "a2" 10 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2" MOV pc, lr Cache_RangeThreshold_ARMv3 ! 0, "arbitrary Cache_RangeThreshold_ARMv3" MOV a1, #16*PageSize MOV pc, lr LTORG ; -------------------------------------------------------------------------- ; ----- generic ARMops for simple ARMs, ARMv4 onwards ---------------------- ; -------------------------------------------------------------------------- ; ; eg. ARM7TDMI based ARMs, unified, writethrough cache ; Cache_InvalidateUnified MOV a1, #0 MCR p15, 0, a1, c7, c7 MOV pc, lr WriteBuffer_Drain_OffOn ; used if ARM has no drain WBuffer MCR op Push "a2" ARM_read_control a1 BIC a2, a1, #MMUC_W ARM_write_control a2 ARM_write_control a1 Pull "a2" MOV pc, lr WriteBuffer_Drain ; used if ARM has proper drain WBuffer MCR op MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 MOV pc, lr TLB_Invalidate_Unified MOV a1, #0 MCR p15, 0, a1, c8, c7 MOV pc, lr ; a1 = page entry to invalidate (page aligned address) ; TLB_InvalidateEntry_Unified MCR p15, 0, a1, c8, c7, 1 MOV pc, lr MMU_Changing_Writethrough MOV a1, #0 MCR p15, 0, a1, c7, c7 ; invalidate cache MCR p15, 0, a1, c8, c7 ; invalidate TLB MOV pc, lr MMU_ChangingUncached MOV a1, #0 MCR p15, 0, a1, c8, c7 ; invalidate TLB MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_Writethrough Push "a4" MOV a4, #0 MCR p15, 0, a4, c7, c7 ; invalidate cache MCR p15, 0, a1, c8, c7, 1 ; invalidate TLB entry Pull "a4" MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_Writethrough ROUT CMP a2, #16 ; arbitrary-ish threshold BHS MMU_Changing_Writethrough Push "a2,a4" MOV a4, #0 MCR p15, 0, a4, c7, c7 ; invalidate cache 10 MCR p15, 0, a1, c8, c7, 1 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2,a4" MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingUncachedEntry MCR p15, 0, a1, c8, c7, 1 ; invalidate TLB entry MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries ROUT CMP a2, #16 ; arbitrary-ish threshold BHS MMU_ChangingUncached Push "a2" 10 MCR p15, 0, a1, c8, c7, 1 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2" MOV pc, lr Cache_RangeThreshold_Writethrough ! 0, "arbitrary Cache_RangeThreshold_Writethrough" MOV a1, #16*PageSize MOV pc, lr ; -------------------------------------------------------------------------- ; ----- ARMops for ARM9 and the like --------------------------------------- ; -------------------------------------------------------------------------- ; WB_CR7_LDa refers to ARMs with writeback data cache, cleaned with ; register 7, lockdown available (format A) ; ; Note that ARM920 etc have writeback/writethrough data cache selectable ; by MMU regions. For simpliciity, we assume cacheable pages are mostly ; writeback. Any writethrough pages will have redundant clean operations ; applied when moved, for example, but this is a small overhead (cleaning ; a clean line is very quick on ARM 9). 
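;
; The Cache_CleanAll/CleanInvalidateAll routines below walk every cache line by
; segment/index, packing both fields into one register and using the ADDS/SUBS
; carry tricks to detect wrap-around.  A rough C model of the traversal
; (illustrative only; clean_dcache_entry and drain_write_buffer stand in for
; the MCRs), using the ARM920 example values quoted in the routine header
; (line_len = 32, index_bit = &04000000, index_seg_start = &000000E0):
;
;   static void clean_all_by_seg_index(unsigned line_len, unsigned index_bit,
;                                      unsigned index_seg_start)
;   {
;       for (unsigned seg = index_seg_start; ; seg -= line_len) {
;           unsigned index = 0;
;           do {                                     /* index counts up...     */
;               clean_dcache_entry(seg | index);     /* MCR p15,0,Rd,c7,c10,2  */
;               index += index_bit;
;           } while (index != 0);                    /* ...until it wraps to 0 */
;           if (seg == 0)
;               break;                               /* segment 0 was the last */
;       }
;       drain_write_buffer();                        /* MCR p15,0,Rd,c7,c10,4  */
;   }
;
; DCache_IndexBit and DCache_IndexSegStart are computed by Analyse_WB_CR7_LDa
; above from the cache's associativity, set count and line length.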
Cache_CleanAll_WB_CR7_LDa ROUT ; ; only guarantees to clean lines not involved in interrupts (so we can ; clean without disabling interrupts) ; ; Clean cache by traversing all segment and index values ; As a concrete example, for ARM 920 (16k+16k caches) we would have: ; ; DCache_LineLen = 32 (32 byte cache line, segment field starts at bit 5) ; DCache_IndexBit = &04000000 (index field starts at bit 26) ; DCache_IndexSegStart = &000000E0 (start at index=0, segment = 7) ; Push "a2, ip" LDR ip, =ZeroPage LDRB a1, [ip, #DCache_LineLen] ; segment field starts at this bit LDR a2, [ip, #DCache_IndexBit] ; index field starts at this bit LDR ip, [ip, #DCache_IndexSegStart] ; starting value, with index at min, seg at max 10 MCR p15, 0, ip, c7, c10, 2 ; clean DCache entry by segment/index ADDS ip, ip, a2 ; next index, counting up, CS if wrapped back to 0 BCC %BT10 SUBS ip, ip, a1 ; next segment, counting down, CC if wrapped back to max BCS %BT10 ; if segment wrapped, then we've finished MOV ip, #0 MCR p15, 0, ip, c7, c10, 4 ; drain WBuffer Pull "a2, ip" MOV pc, lr Cache_CleanInvalidateAll_WB_CR7_LDa ROUT ; ; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache ; Push "a2, ip" LDR ip, =ZeroPage LDRB a1, [ip, #DCache_LineLen] ; segment field starts at this bit LDR a2, [ip, #DCache_IndexBit] ; index field starts at this bit LDR ip, [ip, #DCache_IndexSegStart] ; starting value, with index at min, seg at max 10 MCR p15, 0, ip, c7, c14, 2 ; clean&invalidate DCache entry by segment/index ADDS ip, ip, a2 ; next index, counting up, CS if wrapped back to 0 BCC %BT10 SUBS ip, ip, a1 ; next segment, counting down, CC if wrapped back to max BCS %BT10 ; if segment wrapped, then we've finished MOV ip, #0 MCR p15, 0, ip, c7, c10, 4 ; drain WBuffer MCR p15, 0, ip, c7, c5, 0 ; invalidate ICache Pull "a2, ip" MOV pc, lr Cache_InvalidateAll_WB_CR7_LDa ROUT ; ; no clean, assume caller knows what's happening ; MOV a1, #0 MCR p15, 0, a1, c7, c7, 0 ; invalidate ICache and DCache MOV pc, lr Cache_RangeThreshold_WB_CR7_LDa ROUT LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_CR7_LDa ROUT MMU_ChangingUncached_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MOV pc, lr ; a1 = page affected (page aligned address) ; TLB_InvalidateEntry_WB_CR7_LDa ROUT MMU_ChangingUncachedEntry_WB_CR7_LDa MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MOV pc, lr WriteBuffer_Drain_WB_CR7_LDa ROUT MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MOV pc, lr IMB_Full_WB_CR7_LDa ROUT ; ; do: clean DCache; drain WBuffer, invalidate ICache ; Push "lr" BL Cache_CleanAll_WB_CR7_LDa ; also drains Wbuffer MOV a1, #0 MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache Pull "pc" ; a1 = start address (inclusive, cache line aligned) ; a2 = end address (exclusive, cache line aligned) ; IMB_Range_WB_CR7_LDa ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_CR7_LDa Push "lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry by VA MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer Pull "pc" MMU_Changing_WB_CR7_LDa ROUT Push "lr" BL Cache_CleanInvalidateAll_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB Pull "pc" ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_WB_CR7_LDa ROUT Push "a2, 
lr" ADD a2, a1, #PageSize LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MOV lr, #0 MCR p15, 0, lr, c7, c10, 4 ; drain WBuffer SUB a1, a1, #PageSize MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry Pull "a2, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_WB_CR7_LDa ROUT Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MOV a1, lr ; restore start address 20 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 Pull "a2, a3, pc" ; 30 BL Cache_CleanInvalidateAll_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB Pull "a2, a3, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries_WB_CR7_LDa ROUT CMP a2, #32 ; arbitrary-ish threshold BHS %FT20 Push "a2" 10 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize SUBS a2, a2, #1 BNE %BT10 Pull "a2" MOV pc, lr ; 20 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MOV pc, lr ; -------------------------------------------------------------------------- ; ----- ARMops for StrongARM and the like ---------------------------------- ; -------------------------------------------------------------------------- ; WB_Crd is Writeback data cache, clean by reading data from cleaner area ; Currently no support for mini data cache on some StrongARM variants. Mini ; cache is always writeback and must have cleaning support, so is very ; awkward to use for cacheable screen, say. ; Global cache cleaning requires address space for private cleaner areas (not accessed ; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency ; hit), which means that the cleaner data is not invalidated afterwards. This is fine for ; RISC OS - where the private area is not used for anything else, and any re-use of the ; cache under interrupts is safe (eg. a page being moved is *never* involved in any ; active interrupts). ; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum ; cleaning cost while guaranteeing proper clean even if previous clean data is present. If ; the clean routine is re-entered, an independent, double sized clean is initiated. This ; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating ; the routine at all. The overhead is small, since by far the most common cleaning will be ; non-re-entered. The upshot is that the cleaner address space available must be at least 4 ; times the cache size: ; 1 : used alternately, on 1st, 3rd, ... non-re-entered cleans ; 2 : used alternately, on 2nd, 4th, ... 
non-re-entered cleans ; 3 : used only for first half of a re-entered clean ; 4 : used only for second half of a re-entered clean ; ; DCache_CleanBaseAddress : start address of total cleaner space ; DCache_CleanNextAddress : start address for next non-re-entered clean, or 0 if re-entered Cache_CleanAll_WB_Crd ROUT ; ; - cleans data cache (and invalidates it as a side effect) ; - can be used with interrupts enabled (to avoid latency over time of clean) ; - can be re-entered ; - see remarks at top of StrongARM ops for discussion of strategy ; Push "a2-a4, v1, v2, lr" LDR lr, =ZeroPage LDR a1, [lr, #DCache_CleanBaseAddress] LDR a2, =DCache_CleanNextAddress LDR a3, [lr, #DCache_Size] LDRB a4, [lr, #DCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 LDR lr, [v1], a4 TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Cache_Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer Pull "a2-a4, v1, v2, pc" Cache_CleanInvalidateAll_WB_Crd ROUT IMB_Full_WB_Crd ; ;does not truly invalidate DCache, but effectively invalidates (flushes) all lines not ;involved in interrupts - this is sufficient for OS requirements, and means we don't ;have to disable interrupts for possibly slow clean ; Push "lr" BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache Pull "pc" Cache_InvalidateAll_WB_Crd ; ; no clean, assume caller knows what is happening ; MCR p15, 0, a1, c7, c7, 0 ;flush ICache and DCache MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MOV pc, lr Cache_RangeThreshold_WB_Crd LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_Crd MMU_ChangingUncached_WB_Crd MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB MOV pc, lr TLB_InvalidateEntry_WB_Crd MMU_ChangingUncachedEntry_WB_Crd MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry MCR p15, 0, a1, c8, c5, 0 ;flush ITLB MOV pc, lr WriteBuffer_Drain_WB_Crd MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MOV pc, lr IMB_Range_WB_Crd ROUT SUB a2, a2, a1 CMP a2, #64*1024 ;arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_Crd Push "lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache Pull "pc" MMU_Changing_WB_Crd Push "lr" BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB Pull "pc" MMU_ChangingEntry_WB_Crd ROUT ; ;there is no clean&invalidate DCache instruction, however we can do clean ;entry followed by invalidate entry without an interrupt hole, because they ;are for the same virtual address (and that virtual address will not be ;involved in interrupts, since it is involved in remapping) ; Push "a2, lr" ADD a2, a1, #PageSize LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry MCR p15, 0, a1, c7, c6, 1 ;flush DCache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 SUB a1, a1, #PageSize MCR p15, 0, a1, c7, c10, 4 ;drain 
WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "a2, pc" MMU_ChangingEntries_WB_Crd ROUT ; ;same comments as MMU_ChangingEntry_WB_Crd ; Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry MCR p15, 0, a1, c7, c6, 1 ;flush DCache entry ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache MOV a1, lr ;restore start address 20 MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "a2, a3, pc" ; 30 BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB Pull "a2, a3, pc" MMU_ChangingUncachedEntries_WB_Crd ROUT CMP a2, #32 ;arbitrary-ish threshold BHS %FT20 Push "lr" MOV lr, a2 10 MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry ADD a1, a1, #PageSize SUBS lr, lr, #1 BNE %BT10 MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "pc" ; 20 MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB MOV pc, lr ; ARMops for XScale, mjs Feb 2001 ; ; WB_Cal_LD is writeback, clean with allocate, lockdown ; ; If the mini data cache is used (XScaleMiniCache true), it is assumed to be ; configured writethrough (eg. used for RISC OS screen memory). This saves an ugly/slow ; mini cache clean for things like IMB_Full. ; ; Sadly, for global cache invalidate with mini cache, things are awkward. We can't clean the ; main cache then do the global invalidate MCR, unless we tolerate having _all_ interrupts ; off (else the main cache may be slightly dirty from interrupts, and the invalidate ; will lose data). So we must reluctantly 'invalidate' the mini cache by the ugly/slow ; mechanism as if we were cleaning it :-( Intel should provide a separate global invalidate ; (and perhaps a line allocate) for the mini cache. ; ; We do not use lockdown. ; ; For simplicity, we assume cacheable pages are mostly writeback. Any writethrough ; pages will be invalidated as if they were writeback, but there is little overhead ; (cleaning a clean line or allocating a line from cleaner area are both fast). ; Global cache cleaning requires address space for private cleaner areas (not accessed ; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency ; hit), which means that the cleaner data is not invalidated afterwards. This is fine for ; RISC OS - where the private area is not used for anything else, and any re-use of the ; cache under interrupts is safe (eg. a page being moved is *never* involved in any ; active interrupts). ; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum ; cleaning cost while guaranteeing proper clean even if previous clean data is present. If ; the clean routine is re-entered, an independent, double sized clean is initiated. This ; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating ; the routine at all. The overhead is small, since by far the most common cleaning will be ; non-re-entered. The upshot is that the cleaner address space available must be at least 4 ; times the cache size: ; 1 : used alternately, on 1st, 3rd, ... 
non-re-entered cleans ; 2 : used alternately, on 2nd, 4th, ... non-re-entered cleans ; 3 : used only for first half of a re-entered clean ; 4 : used only for second half of a re-entered clean ; ; If the mini cache is used, it has its own equivalent cleaner space and algorithm. ; Parameters for each cache are: ; ; Cache_CleanBaseAddress : start address of total cleaner space ; Cache_CleanNextAddress : start address for next non-re-entered clean, or 0 if re-entered GBLL XScaleMiniCache ; *must* be configured writethrough if used XScaleMiniCache SETL {FALSE} ; MACRO to do Intel approved CPWAIT, to guarantee any previous MCR's have taken effect ; corrupts a1 ; MACRO CPWAIT MRC p15, 0, a1, c2, c0, 0 ; arbitrary read of CP15 MOV a1, a1 ; wait for it ; SUB pc, pc, #4 omitted, because all ops have a pc load to return to caller MEND Cache_CleanAll_WB_Cal_LD ROUT ; ; - cleans main cache (and invalidates as a side effect) ; - if mini cache is in use, will be writethrough so no clean required ; - can be used with interrupts enabled (to avoid latency over time of clean) ; - can be re-entered ; - see remarks at top of XScale ops for discussion of strategy ; Push "a2-a4, v1, v2, lr" LDR lr, =ZeroPage LDR a1, [lr, #DCache_CleanBaseAddress] LDR a2, =ZeroPage+DCache_CleanNextAddress LDR a3, [lr, #DCache_Size] LDRB a4, [lr, #DCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 MCR p15, 0, v1, c7, c2, 5 ; allocate address from cleaner space ADD v1, v1, a4 TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Cache_Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) Pull "a2-a4, v1, v2, pc" [ XScaleMiniCache Cache_MiniInvalidateAll_WB_Cal_LD ROUT ; ; similar to Cache_CleanAll_WB_Cal_LD, but must do direct reads (cannot use allocate address MCR), and ; 'cleans' to achieve invalidate as side effect (mini cache will be configured writethrough) ; Push "a2-a4, v1, v2, lr" LDR lr, =ZeroPage LDR a1, [lr, #MCache_CleanBaseAddress] LDR a2, =ZeroPage+MCache_CleanNextAddr LDR a3, [lr, #MCache_Size] LDRB a4, [lr, #MCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 LDR lr, [v1], a4 ; read a line of cleaner data TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next ; note, no drain WBuffer, since we are really only invalidating a writethrough cache Pull "a2-a4, v1, v2, pc" ] ; XScaleMiniCache Cache_CleanInvalidateAll_WB_Cal_LD ROUT ; ; - cleans main cache (and invalidates wrt OS stuff as a side effect) ; - if mini cache in use (will be writethrough), 'cleans' in order to invalidate as side effect ; Push "lr" BL Cache_CleanAll_WB_Cal_LD [ XScaleMiniCache BL 
Cache_MiniInvalidateAll_WB_Cal_LD ] MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB CPWAIT Pull "pc" Cache_InvalidateAll_WB_Cal_LD ROUT ; ; no clean, assume caller knows what's happening ; MCR p15, 0, a1, c7, c7, 0 ; invalidate DCache, (MiniCache), ICache and BTB CPWAIT MOV pc, lr Cache_RangeThreshold_WB_Cal_LD ROUT LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_Cal_LD ROUT MMU_ChangingUncached_WB_Cal_LD MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT MOV pc, lr TLB_InvalidateEntry_WB_Cal_LD ROUT MMU_ChangingUncachedEntry_WB_Cal_LD MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry CPWAIT MOV pc, lr WriteBuffer_Drain_WB_Cal_LD ROUT MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) MOV pc, lr IMB_Full_WB_Cal_LD Push "lr" BL Cache_CleanAll_WB_Cal_LD ; clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB CPWAIT Pull "pc" IMB_Range_WB_Cal_LD ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_Cal_LD Push "lr" LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, lr CMP a1, a2 BLO %BT10 [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) Pull "pc" MMU_Changing_WB_Cal_LD ROUT Push "lr" BL Cache_CleanAll_WB_Cal_LD MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT Pull "pc" MMU_ChangingEntry_WB_Cal_LD ROUT ; ;there is no clean&invalidate DCache instruction, however we can do clean ;entry followed by invalidate entry without an interrupt hole, because they ;are for the same virtual address (and that virtual address will not be ;involved in interrupts, since it is involved in remapping) ; Push "a2, lr" ADD a2, a1, #PageSize LDR lr, =ZeroPage LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry MCR p15, 0, a1, c7, c6, 1 ; invalidate DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, lr CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] SUB a1, a1, #PageSize MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry CPWAIT Pull "a2, pc" MMU_ChangingEntries_WB_Cal_LD ROUT ; ;same comments as MMU_ChangingEntry_WB_Cal_LD ; Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize LDR lr, =ZeroPage LDR a3, [lr, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) LDRB a3, [lr, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry MCR p15, 0, a1, c7, c6, 1 ; invalidate DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] MOV a1, lr ; restore start address 20 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize 
CMP a1, a2 BLO %BT20 CPWAIT Pull "a2, a3, pc" ; 30 BL Cache_CleanInvalidateAll_WB_Cal_LD MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT Pull "a2, a3, pc" MMU_ChangingUncachedEntries_WB_Cal_LD ROUT CMP a2, #32 ; arbitrary-ish threshold BHS %FT20 Push "lr" MOV lr, a2 10 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry SUBS lr, lr, #1 ADD a1, a1, #PageSize BNE %BT10 CPWAIT Pull "pc" ; 20 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT MOV pc, lr [ MEMM_Type = "VMSAv6" ; Need appropriate myIMB, etc. implementations if this is to be removed ; -------------------------------------------------------------------------- ; ----- ARMops for Cortex-A8 and the like ---------------------------------- ; -------------------------------------------------------------------------- ; WB_CR7_Lx refers to ARMs with writeback data cache, cleaned with ; register 7, and (potentially) multiple cache levels ; ; DCache_LineLen = log2(line len)-2 for smallest data/unified cache line length ; ICache_LineLen = log2(line len)-2 for smallest instruction cache line length ; DCache_RangeThreshold = clean threshold for data cache ; Cache_Lx_Info = Cache level ID register ; Cache_Lx_DTable = Cache size identification register for all 8 data/unified caches ; Cache_Lx_ITable = Cache size identification register for all 8 instruction caches Cache_CleanAll_WB_CR7_Lx ROUT ; Clean cache by traversing all sets and ways for all data caches Push "a2,a3,a4,v1,v2,v3,v4,v5,lr" LDR lr, =ZeroPage LDR a1, [lr, #Cache_Lx_Info]! ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info BIC a1, a1, #&FF000000 ; Discard unification/coherency bits MOV a2, #0 ; Current cache level 20 TST a1, #7 ; Get flags BEQ %FT10 ; Cache clean complete LDR a3, [lr], #4 ; Get size info AND v1, a3, #&7 ; log2(Line size)-2 BIC a3, a3, #&F0000007 ; Clear flags & line size MOV v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits MOV v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits ; Way number needs to be packed right up at the high end of the data word; shift it up CLZ a4, v2 MOV v2, v2, LSL a4 ; Set number needs to start at log2(Line size)+2 MOV v3, v3, LSL #4 ; Start at bit 4 MOV v3, v3, LSL v1 ; Start at log2(Line size)+2 ; Now calculate the offset numbers we will use to increment sets & ways BIC v4, v2, v2, LSL #1 ; Way increment BIC v5, v3, v3, LSL #1 ; Set increment ; Now we can finally clean this cache! ORR a3, a2, v3 ; Current way (0), set (max), and level 30 MCR p15, 0, a3, c7, c10, 2 ; Clean ADDS a3, a3, v4 ; Increment way BCC %BT30 ; Overflow will occur once ways are enumerated TST a3, v3 ; Are set bits all zero? SUBNE a3, a3, v5 ; No, so decrement set and loop around again BNE %BT30 ; This cache is now clean. Move on to the next level. ADD a2, a2, #2 MOVS a1, a1, LSR #3 BNE %BT20 10 myDSB ,a1 ; Wait for cache cleaning to complete Pull "a2,a3,a4,v1,v2,v3,v4,v5,pc" Cache_CleanInvalidateAll_WB_CR7_Lx ROUT ; ; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache ; Push "a2,a3,a4,v1,v2,v3,v4,v5,lr" LDR lr, =ZeroPage LDR a1, [lr, #Cache_Lx_Info]! 
ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info BIC a1, a1, #&FF000000 ; Discard unification/coherency bits MOV a2, #0 ; Current cache level 20 TST a1, #7 ; Get flags BEQ %FT10 ; Cache clean complete LDR a3, [lr], #4 ; Get size info AND v1, a3, #&7 ; log2(Line size)-2 BIC a3, a3, #&F0000007 ; Clear flags & line size MOV v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits MOV v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits ; Way number needs to be packed right up at the high end of the data word; shift it up CLZ a4, v2 MOV v2, v2, LSL a4 ; Set number needs to start at log2(Line size)+2 MOV v3, v3, LSL #4 ; Start at bit 4 MOV v3, v3, LSL v1 ; Start at log2(Line size)+2 ; Now calculate the offset numbers we will use to increment sets & ways BIC v4, v2, v2, LSL #1 ; Way increment BIC v5, v3, v3, LSL #1 ; Set increment ; Now we can finally clean this cache! ORR a3, a2, v3 ; Current way (0), set (max), and level 30 MCR p15, 0, a3, c7, c14, 2 ; Clean & invalidate ADDS a3, a3, v4 ; Increment way BCC %BT30 ; Overflow will occur once ways are enumerated TST a3, v3 ; Are set bits all zero? SUBNE a3, a3, v5 ; No, so decrement set and loop around again BNE %BT30 ; This cache is now clean. Move on to the next level. ADD a2, a2, #2 MOVS a1, a1, LSR #3 BNE %BT20 10 MOV a1, #0 myDSB ,a1,,y ; Wait for cache clean to complete MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible Pull "a2,a3,a4,v1,v2,v3,v4,v5,pc" Cache_InvalidateAll_WB_CR7_Lx ROUT ; ; no clean, assume caller knows what's happening ; Push "a2,a3,a4,v1,v2,v3,v4,v5,lr" LDR lr, =ZeroPage LDR a1, [lr, #Cache_Lx_Info]! ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info BIC a1, a1, #&FF000000 ; Discard unification/coherency bits MOV a2, #0 ; Current cache level 20 TST a1, #7 ; Get flags BEQ %FT10 ; Cache clean complete LDR a3, [lr], #4 ; Get size info AND v1, a3, #&7 ; log2(Line size)-2 BIC a3, a3, #&F0000007 ; Clear flags & line size MOV v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits MOV v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits ; Way number needs to be packed right up at the high end of the data word; shift it up CLZ a4, v2 MOV v2, v2, LSL a4 ; Set number needs to start at log2(Line size)+2 MOV v3, v3, LSL #4 ; Start at bit 4 MOV v3, v3, LSL v1 ; Start at log2(Line size)+2 ; Now calculate the offset numbers we will use to increment sets & ways BIC v4, v2, v2, LSL #1 ; Way increment BIC v5, v3, v3, LSL #1 ; Set increment ; Now we can finally clean this cache! ORR a3, a2, v3 ; Current way (0), set (max), and level 30 MCR p15, 0, a3, c7, c6, 2 ; Invalidate ADDS a3, a3, v4 ; Increment way BCC %BT30 ; Overflow will occur once ways are enumerated TST a3, v3 ; Are set bits all zero? SUBNE a3, a3, v5 ; No, so decrement set and loop around again BNE %BT30 ; This cache is now clean. Move on to the next level. 
ADD a2, a2, #2 MOVS a1, a1, LSR #3 BNE %BT20 10 MOV a1, #0 myDSB ,a1,,y ; Wait for invalidation to complete MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible Pull "a2,a3,a4,v1,v2,v3,v4,v5,pc" Cache_RangeThreshold_WB_CR7_Lx ROUT LDR a1, =ZeroPage LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr MMU_ChangingUncached_WB_CR7_Lx myDSB ,a1 ; Ensure the page table write has actually completed myISB ,a1,,y ; Also required TLB_InvalidateAll_WB_CR7_Lx ROUT MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1,,y ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingUncachedEntry_WB_CR7_Lx [ NoARMv7 Push "a2" myDSB ,a2 ; Ensure the page table write has actually completed myISB ,a2,,y ; Also required Pull "a2" | myDSB myISB ] TLB_InvalidateEntry_WB_CR7_Lx ROUT MCR p15, 0, a1, c8, c7, 1 ; invalidate ITLB & DTLB entry MCR p15, 0, a1, c7, c5, 6 ; invalidate branch predictors myDSB ,a1 ; Wait for cache/branch invalidation to complete myISB ,a1,,y ; Ensure that the effects of the completed cache/branch invalidation are visible MOV pc, lr WriteBuffer_Drain_WB_CR7_Lx ROUT myDSB ,a1 ; DSB is the new name for write buffer draining myISB ,a1,,y ; Also do ISB for extra paranoia MOV pc, lr IMB_Full_WB_CR7_Lx ROUT ; ; do: clean DCache; drain WBuffer, invalidate ICache/branch predictor ; Luckily, we only need to clean as far as the level of unification ; Push "a2,a3,a4,v1,v2,v3,v4,v5,lr" LDR lr, =ZeroPage LDR a1, [lr, #Cache_Lx_Info]! ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info MOV a1, a1, LSR #27 AND a1, a1, #&7 ; Get level of unification MOV a2, #0 ; Current cache level SUBS a1, a1, #1 BLT %FT10 ; Cache clean complete 20 LDR a3, [lr], #4 ; Get size info AND v1, a3, #&7 ; log2(Line size)-2 BIC a3, a3, #&F0000007 ; Clear flags & line size MOV v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits MOV v3, a3, LSR #13 ; Number of sets-1 in lower 15 bits ; Way number needs to be packed right up at the high end of the data word; shift it up CLZ a4, v2 MOV v2, v2, LSL a4 ; Set number needs to start at log2(Line size)+2 MOV v3, v3, LSL #4 ; Start at bit 4 MOV v3, v3, LSL v1 ; Start at log2(Line size)+2 ; Now calculate the offset numbers we will use to increment sets & ways BIC v4, v2, v2, LSL #1 ; Way increment BIC v5, v3, v3, LSL #1 ; Set increment ; Now we can finally clean this cache! ORR a3, a2, v3 ; Current way (0), set (max), and level 30 MCR p15, 0, a3, c7, c10, 2 ; Clean ADDS a3, a3, v4 ; Increment way BCC %BT30 ; Overflow will occur once ways are enumerated TST a3, v3 ; Are set bits all zero? SUBNE a3, a3, v5 ; No, so decrement set and loop around again BNE %BT30 ; This cache is now clean. Move on to the next level. 
IMB_Full_WB_CR7_Lx ROUT
;
; do: clean DCache; drain WBuffer; invalidate ICache/branch predictor
; Luckily, we only need to clean as far as the level of unification
;
        Push    "a2,a3,a4,v1,v2,v3,v4,v5,lr"
        LDR     lr, =ZeroPage
        LDR     a1, [lr, #Cache_Lx_Info]!
        ADD     lr, lr, #Cache_Lx_DTable-Cache_Lx_Info
        MOV     a1, a1, LSR #27
        AND     a1, a1, #&7                     ; Get level of unification
        MOV     a2, #0                          ; Current cache level
        SUBS    a1, a1, #1
        BLT     %FT10                           ; Cache clean complete
20      LDR     a3, [lr], #4                    ; Get size info
        AND     v1, a3, #&7                     ; log2(Line size)-2
        BIC     a3, a3, #&F0000007              ; Clear flags & line size
        MOV     v2, a3, LSL #19                 ; Number of ways-1 in upper 10 bits
        MOV     v3, a3, LSR #13                 ; Number of sets-1 in lower 15 bits
      ; Way number needs to be packed right up at the high end of the data word; shift it up
        CLZ     a4, v2
        MOV     v2, v2, LSL a4
      ; Set number needs to start at log2(Line size)+2
        MOV     v3, v3, LSL #4                  ; Start at bit 4
        MOV     v3, v3, LSL v1                  ; Start at log2(Line size)+2
      ; Now calculate the offset numbers we will use to increment sets & ways
        BIC     v4, v2, v2, LSL #1              ; Way increment
        BIC     v5, v3, v3, LSL #1              ; Set increment
      ; Now we can finally clean this cache!
        ORR     a3, a2, v3                      ; Current way (0), set (max), and level
30      MCR     p15, 0, a3, c7, c10, 2          ; Clean
        ADDS    a3, a3, v4                      ; Increment way
        BCC     %BT30                           ; Overflow will occur once ways are enumerated
        TST     a3, v3                          ; Are set bits all zero?
        SUBNE   a3, a3, v5                      ; No, so decrement set and loop around again
        BNE     %BT30
      ; This cache is now clean. Move on to the next level.
        ADD     a2, a2, #2
        SUBS    a1, a1, #1
        BGE     %BT20
10      MOV     a1, #0
        myDSB   ,a1,,y                          ; Wait for clean to complete
        MCR     p15, 0, a1, c7, c5, 0           ; invalidate ICache
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1,,y                          ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                          ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a2,a3,a4,v1,v2,v3,v4,v5,pc"

; a1 = start address (inclusive, cache line aligned)
; a2 = end address (exclusive, cache line aligned)
;
IMB_Range_WB_CR7_Lx ROUT
        SUB     a2, a2, a1
        CMP     a2, #32*1024                    ; Maximum L1 cache size on Cortex-A8 is 32K, use that to guess what approach to take
        ADD     a2, a2, a1
        CMPLO   a1, a2                          ; The routine below will fail if the end address wraps around, so just IMB_Full instead
        BHS     IMB_Full_WB_CR7_Lx
        Push    "a1,a3,lr"
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]       ; log2(line len)-2
        MOV     a3, #4
        MOV     lr, a3, LSL lr
10      MCR     p15, 0, a1, c7, c11, 1          ; clean DCache entry by VA to PoU
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        myDSB   ,a1                             ; Wait for clean to complete
        Pull    "a1"                            ; Get start address back
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #ICache_LineLen]       ; Use ICache line length, just in case D&I length differ
        MOV     lr, a3, LSL lr
10      MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1                             ; Wait for cache/branch invalidation to complete
        myISB   ,a1,,y                          ; Ensure that the effects of the completed cache/branch invalidation are visible
        Pull    "a3,pc"

MMU_Changing_WB_CR7_Lx ROUT
        Push    "lr"
        myDSB   ,a1                             ; Ensure the page table write has actually completed
        myISB   ,a1,,y                          ; Also required
        BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        myDSB   ,a1,,y                          ; Wait for TLB invalidation to complete
        myISB   ,a1,,y                          ; Ensure that the effects are visible
        Pull    "pc"

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_WB_CR7_Lx ROUT
        Push    "a2, lr"
        myDSB   ,lr                             ; Ensure the page table write has actually completed
        myISB   ,lr,,y                          ; Also required
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #DCache_LineLen]       ; log2(line len)-2
        MOV     a2, #4
        MOV     lr, a2, LSL lr
        ADD     a2, a1, #PageSize
10      MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry to PoC
        ADD     a1, a1, lr
        CMP     a1, a2
        BNE     %BT10
        myDSB   ,lr                             ; Wait for clean to complete
        LDR     lr, =ZeroPage
        LDRB    lr, [lr, #ICache_LineLen]       ; Use ICache line length, just in case D&I length differ
        MOV     a1, #4
        MOV     lr, a1, LSL lr
        SUB     a1, a2, #PageSize               ; Get start address back
10      MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry
        ADD     a1, a1, lr
        CMP     a1, a2
        BNE     %BT10
        SUB     a1, a1, #PageSize
        MCR     p15, 0, a1, c8, c7, 1           ; invalidate DTLB and ITLB
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1
        myISB   ,a1,,y
        Pull    "a2, pc"
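
; IMB_Range above decides between per-line maintenance and a full IMB by
; comparing the range length against 32K (the largest L1 cache it expects to
; meet), and also falls back to IMB_Full if the end address has wrapped. The
; stored line lengths are log2(line length)-2, so line_bytes = 4 << stored
; value. A rough C-style sketch of the same decision and loops (illustrative
; only; names are hypothetical):
;
;   if (end - start >= 32*1024 || end <= start)
;       imb_full();
;   else {
;       for (p = start; p < end; p += dline) dccmvau(p);  /* c7,c11,1: clean D to PoU */
;       dsb();
;       for (p = start; p < end; p += iline) icimvau(p);  /* c7,c5,1: invalidate I    */
;       bpiall();                                         /* c7,c5,6                  */
;       dsb(); isb();
;   }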
; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_WB_CR7_Lx ROUT
        Push    "a2, a3, lr"
        myDSB   ,lr                             ; Ensure the page table write has actually completed
        myISB   ,lr,,y                          ; Also required
        MOV     a2, a2, LSL #Log2PageSize
        LDR     lr, =ZeroPage
        LDR     a3, [lr, #DCache_RangeThreshold] ; check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1                      ; clean end address (exclusive)
        LDRB    a3, [lr, #DCache_LineLen]       ; log2(line len)-2
        MOV     lr, #4
        MOV     a3, lr, LSL a3
        MOV     lr, a1
10      MCR     p15, 0, a1, c7, c14, 1          ; clean&invalidate DCache entry to PoC
        ADD     a1, a1, a3
        CMP     a1, a2
        BNE     %BT10
        myDSB   ,a3                             ; Wait for clean to complete
        LDR     a3, =ZeroPage
        LDRB    a3, [a3, #ICache_LineLen]       ; Use ICache line length, just in case D&I length differ
        MOV     a1, #4
        MOV     a3, a1, LSL a3
        MOV     a1, lr                          ; Get start address back
10      MCR     p15, 0, a1, c7, c5, 1           ; invalidate ICache entry
        ADD     a1, a1, a3
        CMP     a1, a2
        BNE     %BT10
20      MCR     p15, 0, lr, c8, c7, 1           ; invalidate DTLB & ITLB entry
        ADD     lr, lr, #PageSize
        CMP     lr, a2
        BNE     %BT20
        MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,a1
        myISB   ,a1,,y
        Pull    "a2, a3, pc"
;
30      BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        myDSB   ,a1,,y                          ; Wait for TLB invalidation to complete
        myISB   ,a1,,y                          ; Ensure that the effects are visible
        Pull    "a2, a3, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries_WB_CR7_Lx ROUT
        Push    "a2,lr"
        myDSB   ,lr                             ; Ensure the page table write has actually completed
        myISB   ,lr,,y                          ; Also required
        CMP     a2, #32                         ; arbitrary-ish threshold
        MCRHS   p15, 0, a1, c8, c7, 0           ; invalidate ITLB and DTLB
        BHS     %FT20
10      MCR     p15, 0, a1, c8, c7, 1           ; invalidate DTLB & ITLB entry
        ADD     a1, a1, #PageSize
        SUBS    a2, a2, #1
        BNE     %BT10
20      MCR     p15, 0, a1, c7, c5, 6           ; invalidate branch predictors
        myDSB   ,lr,,y
        myISB   ,lr,,y
        Pull    "a2,pc"

 ] ; MEMM_Type = "VMSAv6"

; --------------------------------------------------------------------------
;
        IMPORT  Write0_Translated

ARM_PrintProcessorType
        LDR     a1, =ZeroPage
        LDRB    a1, [a1, #ProcessorType]
        TEQ     a1, #ARMunk
        MOVEQ   pc, lr
        Push    "lr"
        ADR     a2, PNameTable
        LDHA    a1, a2, a1, a3
        ADD     a1, a2, a1
        BL      Write0_Translated
        SWI     XOS_NewLine
        SWI     XOS_NewLine
        Pull    "pc"

PNameTable
        DCW     PName_ARM600       - PNameTable
        DCW     PName_ARM610       - PNameTable
        DCW     PName_ARM700       - PNameTable
        DCW     PName_ARM710       - PNameTable
        DCW     PName_ARM710a      - PNameTable
        DCW     PName_SA110        - PNameTable        ; pre rev T
        DCW     PName_SA110        - PNameTable        ; rev T or later
        DCW     PName_ARM7500      - PNameTable
        DCW     PName_ARM7500FE    - PNameTable
        DCW     PName_SA1100       - PNameTable
        DCW     PName_SA1110       - PNameTable
        DCW     PName_ARM720T      - PNameTable
        DCW     PName_ARM920T      - PNameTable
        DCW     PName_ARM922T      - PNameTable
        DCW     PName_X80200       - PNameTable
        DCW     PName_X80321       - PNameTable
        DCW     PName_Cortex_A8    - PNameTable
        DCW     PName_ARM1176JZF_S - PNameTable

PName_ARM600
        =       "600:ARM 600 Processor",0
PName_ARM610
        =       "610:ARM 610 Processor",0
PName_ARM700
        =       "700:ARM 700 Processor",0
PName_ARM710
        =       "710:ARM 710 Processor",0
PName_ARM710a
        =       "710a:ARM 710a Processor",0
PName_SA110
        =       "SA110:SA-110 Processor",0
PName_ARM7500
        =       "7500:ARM 7500 Processor",0
PName_ARM7500FE
        =       "7500FE:ARM 7500FE Processor",0
PName_SA1100
        =       "SA1100:SA-1100 Processor",0
PName_SA1110
        =       "SA1110:SA-1110 Processor",0
PName_ARM720T
        =       "720T:ARM 720T Processor",0
PName_ARM920T
        =       "920T:ARM 920T Processor",0
PName_ARM922T
        =       "922T:ARM 922T Processor",0
PName_X80200
        =       "X80200:80200 Processor",0
PName_X80321
        =       "X80321:80321 Processor",0
PName_Cortex_A8
        =       "CortexA8:Cortex-A8 Processor",0
PName_ARM1176JZF_S
        =       "ARM1176JZF_S:ARM1176JZF-S Processor",0

        ALIGN

; Lookup tables from DA flags PCB (bits 14:12,5,4, packed down to 4:2,1,0)
; to XCB bits in page table descriptors.
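;
; As a worked illustration of that packing (an assumption about usage, not a
; restatement of other kernel code): bits 14:12 of the DA flags become bits
; 4:2 of the index, bit 5 becomes bit 1 and bit 4 becomes bit 0, i.e.
;
;   index = ((flags >> 12) & 7) << 2 | ((flags >> 5) & 1) << 1 | ((flags >> 4) & 1);
;
; A policy value of 1 with both low bits clear therefore packs to 4 which, if
; the index is used as a byte offset into one of the 32-byte tables below,
; selects the first ("C+B") column of the second ("WT") row.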
XCB_NB  *       1:SHL:0
XCB_NC  *       1:SHL:1
XCB_P   *       1:SHL:2

        ALIGN 32

; WT read-allocate cache (eg ARM720T)
XCBTableWT                                      ; C+B        CNB  NCB          NCNB
        =       L2_C+L2_B,      L2_C,   L2_B,   0       ; Default
        =       L2_C+L2_B,      L2_C,   L2_B,   0       ; WT,         X,  Non-merging, X
        =       L2_C+L2_B,      L2_C,   L2_B,   0       ; WB/RA,      X,  Merging,     X
        =       L2_C+L2_B,      L2_C,   L2_B,   0       ; WB/WA,      X,  X,           X
        =       L2_C+L2_B,      L2_C,   L2_B,   0       ; Alt DCache, X,  X,           X
        =       L2_C+L2_B,      L2_C,   L2_B,   0       ; X,          X,  X,           X
        =       L2_C+L2_B,      L2_C,   L2_B,   0       ; X,          X,  X,           X
        =       L2_C+L2_B,      L2_C,   L2_B,   0       ; X,          X,  X,           X

; SA-110 in Risc PC - WB only read-allocate cache, non-merging WB
XCBTableSA110
        =       L2_C+L2_B,      0,      L2_B,   0       ; Default
        =       L2_B,           0,      L2_B,   0       ; WT,         X,  Non-merging, X
        =       L2_C+L2_B,      0,      L2_B,   0       ; WB/RA,      X,  Merging,     X
        =       L2_C+L2_B,      0,      L2_B,   0       ; WB/WA,      X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; Alt DCache, X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; X,          X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; X,          X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; X,          X,  X,           X

; ARMv5 WB/WT read-allocate cache, non-merging WB (eg ARM920T)
XCBTableWBR
        =       L2_C+L2_B,      0,      L2_B,   0       ; Default
        =       L2_C,           0,      L2_B,   0       ; WT,         X,  Non-merging, X
        =       L2_C+L2_B,      0,      L2_B,   0       ; WB/RA,      X,  Merging,     X
        =       L2_C+L2_B,      0,      L2_B,   0       ; WB/WA,      X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; Alt DCache, X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; X,          X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; X,          X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; X,          X,  X,           X

; SA-1110 - WB only read allocate cache, merging WB, mini D-cache
XCBTableSA1110
        =       L2_C+L2_B,      0,      L2_B,   0       ; Default
        =       L2_B,           0,      0,      0       ; WT,         X,  Non-merging, X
        =       L2_C+L2_B,      0,      L2_B,   0       ; WB/RA,      X,  Merging,     X
        =       L2_C+L2_B,      0,      L2_B,   0       ; WB/WA,      X,  X,           X
        =       L2_C,           0,      L2_B,   0       ; Alt DCache, X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; X,          X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; X,          X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,   0       ; X,          X,  X,           X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
; defaulting to read-allocate
XCBTableXScaleRA
        =       L2_C+L2_B,      0,      L2_B,       0   ; Default
        =       L2_C,           0,      L2_X+L2_B,  0   ; WT,         X,  Non-merging, X
        =       L2_C+L2_B,      0,      L2_B,       0   ; WB/RA,      X,  Merging,     X
        =       L2_X+L2_C+L2_B, 0,      L2_B,       0   ; WB/WA,      X,  X,           X
        =       L2_X+L2_C,      0,      L2_B,       0   ; Alt DCache, X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,       0   ; X,          X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,       0   ; X,          X,  X,           X
        =       L2_C+L2_B,      0,      L2_B,       0   ; X,          X,  X,           X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
; defaulting to write-allocate
XCBTableXScaleWA
        =       L2_X+L2_C+L2_B, 0,      L2_B,       0   ; Default
        =       L2_C,           0,      L2_X+L2_B,  0   ; WT,         X,  Non-merging, X
        =       L2_C+L2_B,      0,      L2_B,       0   ; WB/RA,      X,  Merging,     X
        =       L2_X+L2_C+L2_B, 0,      L2_B,       0   ; WB/WA,      X,  X,           X
        =       L2_X+L2_C,      0,      L2_B,       0   ; Alt DCache, X,  X,           X
        =       L2_X+L2_C+L2_B, 0,      L2_B,       0   ; X,          X,  X,           X
        =       L2_X+L2_C+L2_B, 0,      L2_B,       0   ; X,          X,  X,           X
        =       L2_X+L2_C+L2_B, 0,      L2_B,       0   ; X,          X,  X,           X

        END