; Copyright 2000 Pace Micro Technology plc ; ; Licensed under the Apache License, Version 2.0 (the "License"); ; you may not use this file except in compliance with the License. ; You may obtain a copy of the License at ; ; http://www.apache.org/licenses/LICENSE-2.0 ; ; Unless required by applicable law or agreed to in writing, software ; distributed under the License is distributed on an "AS IS" BASIS, ; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; See the License for the specific language governing permissions and ; limitations under the License. ; ; GET Hdr:ListOpts ; GET Hdr:Macros ; GET Hdr:System ; $GetCPU ; $GetMEMM ; GET hdr.Options ; GET Hdr:PublicWS ; GET Hdr:KernelWS ; GET hdr.Copro15ops ; GET hdr.ARMops v7 RN 10 ; EXPORT Init_ARMarch ; EXPORT ARM_Analyse ; EXPORT ARM_PrintProcessorType ; AREA KernelCode,CODE,READONLY ; ARM keep changing their mind about ID field layout. ; Here's a summary, courtesy of the ARM ARM (v5): ; ; pre-ARM 7: xxxx0xxx ; ARM 7: xxxx7xxx where bit 23 indicates v4T/~v3 ; post-ARM 7: xxxanxxx where n<>0 or 7 and a = architecture (1=4,2=4T,3=5,4=5T) ; ; int Init_ARMarch(void) ; Returns architecture, as above in a1. Also EQ if ARMv3, NE if ARMv4 or later. ; Corrupts only ip, no RAM usage. Init_ARMarch ARM_read_ID ip ANDS a1, ip, #&0000F000 MOVEQ pc, lr ; ARM 3 or ARM 6 TEQ a1, #&00007000 BNE %FT20 TST ip, #&00800000 ; ARM 7 - check for Thumb MOVNE a1, #ARMv4T MOVEQ a1, #ARMv3 MOV pc, lr 20 ANDS a1, ip, #&000F0000 ; post-ARM 7 MOV a1, a1, LSR #16 MOV pc, lr ARM_Analyse MOV a2, lr BL Init_ARMarch MOV lr, a2 CMP a1, #ARMvF BEQ ARM_Analyse_Fancy ; New ARM; use the feature regs to perform all the setup Push "v1,v2,v5,v6,v7,lr" ARM_read_ID v1 ARM_read_cachetype v2 MOV v6, #ZeroPage ADRL v7, KnownCPUTable FindARMloop LDMIA v7!, {a1, a2} ; See if it's a known ARM CMP a1, #-1 BEQ %FT20 AND a2, v1, a2 TEQ a1, a2 ADDNE v7, v7, #8 BNE FindARMloop TEQ v2, v1 ; If we don't have cache attributes, read from table LDREQ v2, [v7] 20 TEQ v2, v1 BEQ %BT20 ; Cache unknown: panic CMP a1, #-1 LDRNEB a2, [v7, #4] MOVEQ a2, #ARMunk STRB a2, [v6, #ProcessorType] ASSERT CT_Isize_pos = 0 MOV a1, v2 ADD a2, v6, #ICache_Info BL EvaluateCache MOV a1, v2, LSR #CT_Dsize_pos ADD a2, v6, #DCache_Info BL EvaluateCache AND a1, v2, #CT_ctype_mask MOV a1, a1, LSR #CT_ctype_pos STRB a1, [v6, #Cache_Type] [ No26bitCode MOV v5, #CPUFlag_32bitOS | MOV v5, #0 ] TST v2, #CT_S ORRNE v5, v5, #CPUFlag_SplitCache+CPUFlag_SynchroniseCodeAreas [ CacheOff ORR v5, v5, #CPUFlag_SynchroniseCodeAreas | ARM_read_control a1 ; if Z bit set then we have branch prediction, TST a1, #MMUC_Z ; so we need OS_SynchroniseCodeAreas even if not ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas ; split caches ] ; Test abort timing (base restored or base updated) MOV a1, #&8000 LDR a2, [a1], #4 ; Will abort - DAb handler will continue execution TEQ a1, #&8000 ORREQ v5, v5, #CPUFlag_BaseRestored ; Check store of PC 30 STR pc, [sp, #-4]! ADR a2, %BT30 + 8 LDR a1, [sp], #4 TEQ a1, a2 ORREQ v5, v5, #CPUFlag_StorePCplus8 [ 0=1 ; Check whether 26-bit mode is available MSR CPSR_c, #F32_bit+I32_bit+SVC26_mode MRS a1, CPSR AND a1, a1, #M32_bits TEQ a1, #SVC26_mode ORRNE v5, v5, #CPUFlag_No26bitMode MSREQ CPSR_c, #F32_bit+I32_bit+SVC32_mode BNE %FT35 ; Do we get vector exceptions on read? 
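; Note: the probe below is only assembled if the "[ 0=1" switch above is enabled.
; Reading address 0 either aborts - leaving a1 unchanged at 0, so the EQ path
; records CPUFlag_VectorReadException - or returns the (assumed non-zero)
; contents of the hardware vector, leaving the flag clear.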
MOV a1, #0 LDR a1, [a1] ; If this aborts a1 will be left unchanged TEQ a1, #0 ORREQ v5, v5, #CPUFlag_VectorReadException ] 35 BL Init_ARMarch STRB a1, [v6, #ProcessorArch] TEQ a1, #ARMv3 ; assume long multiply available ORRNE v5, v5, #CPUFlag_LongMul ; if v4 or later TEQNE a1, #ARMv4 ; assume 26-bit available ORRNE v5, v5, #CPUFlag_No26bitMode ; iff v3 or v4 (not T) TEQNE a1, #ARMv5 ; assume Thumb available ORRNE v5, v5, #CPUFlag_Thumb ; iff not v3,v4,v5 MSR CPSR_f, #Q32_bit MRS lr, CPSR TST lr, #Q32_bit ORRNE v5, v5, #CPUFlag_DSP LDRB v4, [v6, #ProcessorType] TEQ v4, #ARMunk ; Modify deduced flags ADRNEL lr, KnownCPUFlags ADDNE lr, lr, v4, LSL #3 LDMNEIA lr, {a2, a3} ORRNE v5, v5, a2 BICNE v5, v5, a3 [ XScaleJTAGDebug TST v5, #CPUFlag_XScale BEQ %FT40 MRC p14, 0, a2, c10, c0 ; Read debug control register TST a2, #&80000000 ORRNE v5, v5, #CPUFlag_XScaleJTAGconnected MOVEQ a2, #&C000001C ; enable hot debug MCREQ p14, 0, a2, c10, c0 BNE %FT40 40 ] STR v5, [v6, #ProcessorFlags] ; Now, a1 = processor architecture (ARMv3, ARMv4 ...) ; v4 = processor type (ARM600, ARM610, ...) ; v5 = processor flags CMP a1, #ARMv4 BLO Analyse_ARMv3 ; eg. ARM710 LDRB a2, [v6, #Cache_Type] TEQ a2, #CT_ctype_WT TSTEQ v5, #CPUFlag_SplitCache BEQ Analyse_WriteThroughUnified ; eg. ARM7TDMI derivative TEQ a2, #CT_ctype_WB_CR7_LDa BEQ Analyse_WB_CR7_LDa ; eg. ARM9 TEQ a2, #CT_ctype_WB_Crd BEQ Analyse_WB_Crd ; eg. StrongARM TEQ a2, #CT_ctype_WB_Cal_LD BEQ Analyse_WB_Cal_LD ; assume XScale ; others ... WeirdARMPanic B WeirdARMPanic ; stiff :) Analyse_ARMv3 ADRL a1, NullOp ADRL a2, Cache_Invalidate_ARMv3 ADRL a3, WriteBuffer_Drain_ARMv3 ADRL a4, TLB_Invalidate_ARMv3 ADRL ip, TLB_InvalidateEntry_ARMv3 STR a1, [v6, #Proc_Cache_CleanAll] STR a2, [v6, #Proc_Cache_CleanInvalidateAll] STR a2, [v6, #Proc_Cache_InvalidateAll] STR a3, [v6, #Proc_WriteBuffer_Drain] STR a4, [v6, #Proc_TLB_InvalidateAll] STR ip, [v6, #Proc_TLB_InvalidateEntry] STR a1, [v6, #Proc_IMB_Full] STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_ARMv3 ADRL a2, MMU_ChangingEntry_ARMv3 ADRL a3, MMU_ChangingUncached_ARMv3 ADRL a4, MMU_ChangingUncachedEntry_ARMv3 STR a1, [v6, #Proc_MMU_Changing] STR a2, [v6, #Proc_MMU_ChangingEntry] STR a3, [v6, #Proc_MMU_ChangingUncached] STR a4, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_ARMv3 ADRL a2, MMU_ChangingUncachedEntries_ARMv3 ADRL a3, Cache_RangeThreshold_ARMv3 STR a1, [v6, #Proc_MMU_ChangingEntries] STR a2, [v6, #Proc_MMU_ChangingUncachedEntries] STR a3, [v6, #Proc_Cache_RangeThreshold] ADRL a1, XCBTableWT STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WriteThroughUnified ADRL a1, NullOp ADRL a2, Cache_InvalidateUnified TST v5, #CPUFlag_NoWBDrain ADRNEL a3, WriteBuffer_Drain_OffOn ADREQL a3, WriteBuffer_Drain ADRL a4, TLB_Invalidate_Unified ADRL ip, TLB_InvalidateEntry_Unified STR a1, [v6, #Proc_Cache_CleanAll] STR a2, [v6, #Proc_Cache_CleanInvalidateAll] STR a2, [v6, #Proc_Cache_InvalidateAll] STR a3, [v6, #Proc_WriteBuffer_Drain] STR a4, [v6, #Proc_TLB_InvalidateAll] STR ip, [v6, #Proc_TLB_InvalidateEntry] STR a1, [v6, #Proc_IMB_Full] STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_Writethrough ADRL a2, MMU_ChangingEntry_Writethrough ADRL a3, MMU_ChangingUncached ADRL a4, MMU_ChangingUncachedEntry STR a1, [v6, #Proc_MMU_Changing] STR a2, [v6, #Proc_MMU_ChangingEntry] STR a3, [v6, #Proc_MMU_ChangingUncached] STR a4, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_Writethrough ADRL a2, MMU_ChangingUncachedEntries ADRL a3, Cache_RangeThreshold_Writethrough STR a1, [v6, 
#Proc_MMU_ChangingEntries] STR a2, [v6, #Proc_MMU_ChangingUncachedEntries] STR a3, [v6, #Proc_Cache_RangeThreshold] ADRL a1, XCBTableWT STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WB_CR7_LDa TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard caches here (eg. ARM920) ADRL a1, Cache_CleanInvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_CR7_LDa STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_CR7_LDa STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_CR7_LDa STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_CR7_LDa STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_CR7_LDa STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_CR7_LDa STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_CR7_LDa STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_CR7_LDa STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] MOV a1, #0 LDRB a2, [a1, #DCache_Associativity] MOV a3, #256 MOV a4, #8 ; to find log2(ASSOC), rounded up Analyse_WB_CR7_LDa_L1 MOV a3, a3, LSR #1 SUB a4, a4, #1 CMP a2, a3 BLO Analyse_WB_CR7_LDa_L1 ADDHI a4, a4, #1 RSB a2, a4, #32 MOV a3, #1 MOV a3, a3, LSL a2 STR a3, [a1, #DCache_IndexBit] LDR a4, [a1, #DCache_NSets] LDRB a2, [a1, #DCache_LineLen] SUB a4, a4, #1 MUL a4, a2, a4 STR a4, [a1, #DCache_IndexSegStart] MOV a2, #64*1024 ; arbitrary-ish STR a2, [a1, #DCache_RangeThreshold] ADRL a1, XCBTableWBR ; assume read-allocate WB/WT cache STR a1, [v6, #MMU_PCBTrans] B %FT90 Analyse_WB_Crd TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard ADRL a1, Cache_CleanInvalidateAll_WB_Crd STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_Crd STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_Crd STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_Crd STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_Crd STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_Crd STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_Crd STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_Crd STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_Crd STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_Crd STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_Crd STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_Crd STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_Crd STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] MOV a1, #0 LDR a2, =DCacheCleanAddress STR a2, [a1, #DCache_CleanBaseAddress] STR a2, [a1, #DCache_CleanNextAddress] MOV a2, #64*1024 ;arbitrary-ish threshold STR a2, [a1, #DCache_RangeThreshold] LDRB a2, [a1, #ProcessorType] TEQ a2, #SA110 ADREQL a2, XCBTableSA110 BEQ Analyse_WB_Crd_finish TEQ a2, #SA1100 TEQNE 
a2, #SA1110 ADREQL a2, XCBTableSA1110 ADRNEL a2, XCBTableWBR Analyse_WB_Crd_finish STR a2, [a1, #MMU_PCBTrans] B %FT90 Analyse_WB_Cal_LD TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard ADRL a1, Cache_CleanInvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_Cal_LD STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_Cal_LD STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_Cal_LD STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_Cal_LD STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_Cal_LD STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_Cal_LD STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_Cal_LD STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_Cal_LD STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] MOV a1, #0 LDR a2, =DCacheCleanAddress STR a2, [a1, #DCache_CleanBaseAddress] STR a2, [a1, #DCache_CleanNextAddress] [ XScaleMiniCache ! 1, "You need to arrange for XScale mini-cache clean area to be mini-cacheable" LDR a2, =DCacheCleanAddress + 4 * 32*1024 STR a2, [a1, #MCache_CleanBaseAddress] STR a2, [a1, #MCache_CleanNextAddress] ] ; arbitrary-ish values, mini cache makes global op significantly more expensive [ XScaleMiniCache MOV a2, #128*1024 | MOV a2, #32*1024 ] STR a2, [a1, #DCache_RangeThreshold] ; enable full coprocessor access LDR a2, =&3FFF MCR p15, 0, a2, c15, c1 ADRL a2, XCBTableXScaleWA ; choose between RA and WA here STR a2, [a1, #MMU_PCBTrans] B %FT90 Analyse_WB_CR7_Lx TST v5, #CPUFlag_SplitCache BEQ WeirdARMPanic ; currently, only support harvard caches here ; Read the cache info into Cache_Lx_* MRC p15, 1, a1, c0, c0, 1 ; Cache level ID register MOV a2, v6 ; Work around DTable/ITable alignment issues STR a1, [v2, #Cache_Lx_Info]! ADD a1, v2, #Cache_Lx_DTable-Cache_Lx_Info ADD a2, v2, #Cache_Lx_ITable-Cache_Lx_Info MOV a3, #0 MOV a4, #256 ; Smallest instruction cache line length MOV v2, #256 ; Smallest data/unified cache line length (although atm we only need this to be the smallest data cache line length) 10 MCR p15, 2, a3, c0, c0, 0 ; Program cache size selection register MRC p15, 1, v1, c0, c0, 0 ; Get size info (data/unified) STR v1, [a1,#4] CMP v1, #0 ; Does the cache exist? AND v1, v1, #7 ; Get line size CMPNE v1, v2 MOVLT v2, v1 ; Earlier CMP will not set LE flags if v1=0 ADD a3, a3, #1 MCR p15, 2, a3, c0, c0, 0 ; Program cache size selection register MRC p15, 1, v1, c0, c0, 0 ; Get size info (instruction) STR v1, [a2,#4] CMP v1, #0 ; Does the cache exist? 
AND v1, v1, #7 ; Get line size CMPNE v1, a4 MOVLT a4, v1 ; Earlier CMP will not set LE flags if v1=0 ADD a3, a3, #1 CMP a3, #16 BLT %BT10 STRB a4, [v6, #ICache_LineLen] ; Store log2(line size)-2 STRB v2, [v6, #DCache_LineLen] ; log2(line size)-2 ; Calculate DCache_RangeThreshold MOV a1, #128*1024 ; Arbitrary-ish STR a1, [v6, #DCache_RangeThreshold] ADRL a1, Cache_CleanInvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_CleanInvalidateAll] ADRL a1, Cache_CleanAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_CleanAll] ADRL a1, Cache_InvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_Cache_InvalidateAll] ADRL a1, Cache_RangeThreshold_WB_CR7_Lx STR a1, [v6, #Proc_Cache_RangeThreshold] ADRL a1, TLB_InvalidateAll_WB_CR7_Lx STR a1, [v6, #Proc_TLB_InvalidateAll] ADRL a1, TLB_InvalidateEntry_WB_CR7_Lx STR a1, [v6, #Proc_TLB_InvalidateEntry] ADRL a1, WriteBuffer_Drain_WB_CR7_Lx STR a1, [v6, #Proc_WriteBuffer_Drain] ADRL a1, IMB_Full_WB_CR7_Lx STR a1, [v6, #Proc_IMB_Full] ADRL a1, IMB_Range_WB_CR7_Lx STR a1, [v6, #Proc_IMB_Range] ADRL a1, MMU_Changing_WB_CR7_Lx STR a1, [v6, #Proc_MMU_Changing] ADRL a1, MMU_ChangingEntry_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingEntry] ADRL a1, MMU_ChangingUncached_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncached] ADRL a1, MMU_ChangingUncachedEntry_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncachedEntry] ADRL a1, MMU_ChangingEntries_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingEntries] ADRL a1, MMU_ChangingUncachedEntries_WB_CR7_Lx STR a1, [v6, #Proc_MMU_ChangingUncachedEntries] ADRL a1, XCBTableWBR ; assume read-allocate WB/WT cache STR a1, [v6, #MMU_PCBTrans] B %FT90 90 Pull "v1,v2,v5,v6,v7,pc" ; This routine works out the values LINELEN, ASSOCIATIVITY, NSETS and CACHE_SIZE defined in section ; B2.3.3 of the ARMv5 ARM. EvaluateCache AND a3, a1, #CT_assoc_mask+CT_M TEQ a3, #(CT_assoc_0:SHL:CT_assoc_pos)+CT_M BEQ %FT80 MOV ip, #1 ASSERT CT_len_pos = 0 AND a4, a1, #CT_len_mask ADD a4, a4, #3 MOV a4, ip, LSL a4 ; LineLen = 1 << (len+3) STRB a4, [a2, #ICache_LineLen-ICache_Info] MOV a3, #2 TST a1, #CT_M ADDNE a3, a3, #1 ; Multiplier = 2 + M AND a4, a1, #CT_assoc_mask RSB a4, ip, a4, LSR #CT_assoc_pos MOV a4, a3, LSL a4 ; Associativity = Multiplier << (assoc-1) STRB a4, [a2, #ICache_Associativity-ICache_Info] AND a4, a1, #CT_size_mask MOV a4, a4, LSR #CT_size_pos MOV a3, a3, LSL a4 MOV a3, a3, LSL #8 ; Size = Multiplier << (size+8) STR a3, [a2, #ICache_Size-ICache_Info] ADD a4, a4, #6 AND a3, a1, #CT_assoc_mask SUB a4, a4, a3, LSR #CT_assoc_pos AND a3, a1, #CT_len_mask ASSERT CT_len_pos = 0 SUB a4, a4, a3 MOV a4, ip, LSL a4 ; NSets = 1 << (size + 6 - assoc - len) STR a4, [a2, #ICache_NSets-ICache_Info] MOV pc, lr 80 MOV a1, #0 STR a1, [a2, #ICache_NSets-ICache_Info] STR a1, [a2, #ICache_Size-ICache_Info] STRB a1, [a2, #ICache_LineLen-ICache_Info] STRB a1, [a2, #ICache_Associativity-ICache_Info] MOV pc, lr ; Create a list of CPUs, 16 bytes per entry: ; ID bits (1 word) ; Test mask for ID (1 word) ; Cache type register value (1 word) ; Processor type (1 byte) ; Architecture type (1 byte) ; Reserved (2 bytes) GBLA tempcpu MACRO CPUDesc $proc, $id, $mask, $arch, $type, $s, $dsz, $das, $dln, $isz, $ias, $iln LCLA type type SETA (CT_ctype_$type:SHL:CT_ctype_pos)+($s:SHL:CT_S_pos) tempcpu CSzDesc $dsz, $das, $dln type SETA type+(tempcpu:SHL:CT_Dsize_pos) [ :LNOT:($s=0 :LAND: "$isz"="") tempcpu CSzDesc $isz, $ias, $iln ] type SETA type+(tempcpu:SHL:CT_Isize_pos) ASSERT ($id :AND: :NOT: $mask) = 0 DCD $id, $mask, type DCB $proc, $arch, 0, 0 MEND MACRO $var CSzDesc $sz, $as, $ln $var SETA 
(CT_size_$sz:SHL:CT_size_pos)+(CT_assoc_$as:SHL:CT_assoc_pos)+(CT_len_$ln:SHL:CT_len_pos)
$var    SETA    $var+(CT_M_$sz:SHL:CT_M_pos)
        MEND

KnownCPUTable
;                                                   /------Cache Type register fields-----\
;                      ID reg     Mask       Arch     Type        S  Dsz  Das Dln  Isz  Ias Iln
        CPUDesc ARM600,        &000600,   &00FFF0,   ARMv3,   WT,         0,  4K,  64,  4
        CPUDesc ARM610,        &000610,   &00FFF0,   ARMv3,   WT,         0,  4K,  64,  4
        CPUDesc ARMunk,        &000000,   &00F000,   ARMv3,   WT,         0,  4K,  64,  4
        CPUDesc ARM700,        &007000,   &FFFFF0,   ARMv3,   WT,         0,  8K,   4,  8
        CPUDesc ARM710,        &007100,   &FFFFF0,   ARMv3,   WT,         0,  8K,   4,  8
        CPUDesc ARM710a,       &047100,   &FDFFF0,   ARMv3,   WT,         0,  8K,   4,  4
        CPUDesc ARM7500,       &067100,   &FFFFF0,   ARMv3,   WT,         0,  4K,   4,  4
        CPUDesc ARM7500FE,     &077100,   &FFFFF0,   ARMv3,   WT,         0,  4K,   4,  4
        CPUDesc ARMunk,        &007000,   &80F000,   ARMv3,   WT,         0,  8K,   4,  4
        CPUDesc ARM720T,       &807200,   &FFFFF0,   ARMv4T,  WT,         0,  8K,   4,  4
        CPUDesc ARMunk,        &807000,   &80F000,   ARMv4T,  WT,         0,  8K,   4,  4
        CPUDesc SA110_preRevT, &01A100,   &0FFFFC,   ARMv4,   WB_Crd,     1, 16K,  32,  8, 16K, 32, 8
        CPUDesc SA110,         &01A100,   &0FFFF0,   ARMv4,   WB_Crd,     1, 16K,  32,  8, 16K, 32, 8
        CPUDesc SA1100,        &01A110,   &0FFFF0,   ARMv4,   WB_Crd,     1,  8K,  32,  8, 16K, 32, 8
        CPUDesc SA1110,        &01B110,   &0FFFF0,   ARMv4,   WB_Crd,     1,  8K,  32,  8, 16K, 32, 8
        CPUDesc ARM920T,       &029200,   &0FFFF0,   ARMv4T,  WB_CR7_LDa, 1, 16K,  64,  8, 16K, 64, 8
        CPUDesc ARM922T,       &029220,   &0FFFF0,   ARMv4T,  WB_CR7_LDa, 1,  8K,  64,  8,  8K, 64, 8
        CPUDesc X80200,        &052000,   &0FFFF0,   ARMv5TE, WB_Cal_LD,  1, 32K,  32,  8, 32K, 32, 8
        CPUDesc X80321,        &69052400, &FFFFF700, ARMv5TE, WB_Cal_LD,  1, 32K,  32,  8, 32K, 32, 8
        DCD     -1

; Simplified CPUDesc table for Fancy ARMs
; The cache size data is ignored
KnownCPUTable_Fancy
        CPUDesc Cortex_A8,     &00C080,   &00FFF0,   ARMv5TE, WB_CR7_Lx,  1, 16K,  32, 16, 16K, 32, 16
        DCD     -1

; Peculiar characteristics of individual ARMs not deducible otherwise. First field is
; flags to set, second flags to clear.
KnownCPUFlags
        DCD     0,                                                  0   ; ARM 600
        DCD     0,                                                  0   ; ARM 610
        DCD     0,                                                  0   ; ARM 700
        DCD     0,                                                  0   ; ARM 710
        DCD     0,                                                  0   ; ARM 710a
        DCD     CPUFlag_AbortRestartBroken+CPUFlag_InterruptDelay,  0   ; SA 110 pre revT
        DCD     CPUFlag_InterruptDelay,                             0   ; SA 110 revT or later
        DCD     0,                                                  0   ; ARM 7500
        DCD     0,                                                  0   ; ARM 7500FE
        DCD     CPUFlag_InterruptDelay,                             0   ; SA 1100
        DCD     CPUFlag_InterruptDelay,                             0   ; SA 1110
        DCD     CPUFlag_NoWBDrain,                                  0   ; ARM 720T
        DCD     0,                                                  0   ; ARM 920T
        DCD     0,                                                  0   ; ARM 922T
        DCD     CPUFlag_ExtendedPages+CPUFlag_XScale,               0   ; X80200
        DCD     CPUFlag_XScale,                                     0   ; X80321
        DCD     0,                                                  0   ; Cortex_A8

; --------------------------------------------------------------------------
; ----- ARM_Analyse_Fancy --------------------------------------------------
; --------------------------------------------------------------------------
;
; Although I don't have a copy of the ARMv7 ARM to check, I suspect that all
; ARMs with an architecture of &F implement the feature registers described
; in the Cortex-A8 TRM. Thus, for these new ARMs, we shall use the feature
; registers to discover what the CPU is like.
; Things we need to set up:
;   ProcessorType      (as listed in hdr.ARMops)
;   Cache_Type         (CT_ctype_* from hdr:MEMM.ARM600)
;   ProcessorArch      (as reported by Init_ARMarch)
;   ProcessorFlags     (CPUFlag_* from hdr.ARMops)
;   Proc_*             (Cache/TLB/IMB/MMU function pointers)
;   MMU_PCBTrans       (Points to lookup table for translating page table cache options)
;   ICache_*, DCache_* (ICache, DCache properties - optional, since not used externally?)
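; As an illustrative example of the lookup below: a Cortex-A8 reporting an ID
; of &410FC082 matches the KnownCPUTable_Fancy entry above, since
; &410FC082 AND &00FFF0 = &00C080, the entry's ID value; ProcessorType is then
; Cortex_A8 and Cache_Type comes from the entry's cache type word
; (CT_ctype_WB_CR7_Lx). An architecture-&F part not in the table becomes
; ARMunk, and its remaining properties are deduced from the feature and cache
; ID registers.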
ARM_Analyse_Fancy Push "v1,v2,v5,v6,v7,lr" ARM_read_ID v1 MOV v6, #ZeroPage ADRL v7, KnownCPUTable_Fancy 10 LDMIA v7!, {a1, a2} CMP a1, #-1 BEQ %FT20 AND a2, v1, a2 TEQ a1, a2 ADDNE v7, v7, #8 BNE %BT10 20 LDR v2, [v7] CMP a1, #-1 LDRNEB a2, [v7, #4] MOVEQ a2, #ARMunk STRB a2, [v6, #ProcessorType] AND a1, v2, #CT_ctype_mask MOV a1, a1, LSR #CT_ctype_pos STRB a1, [v6, #Cache_Type] MOV v5, #CPUFlag_32bitOS+CPUFlag_No26bitMode ; 26bit has been obsolete for a long time ; Do we have a split cache? MRC p15, 1, a1, c0, c0, 1 AND a2, a1, #7 TEQ a2, #3 ORREQ v5, v5, #CPUFlag_SynchroniseCodeAreas+CPUFlag_SplitCache [ CacheOff ORR v5, v5, #CPUFlag_SynchroniseCodeAreas | ARM_read_control a1 ; if Z bit set then we have branch prediction, TST a1, #MMUC_Z ; so we need OS_SynchroniseCodeAreas even if not ORRNE v5, v5, #CPUFlag_SynchroniseCodeAreas ; split caches ] ; Test abort timing (base restored or base updated) MOV a1, #&8000 LDR a2, [a1], #4 ; Will abort - DAb handler will continue execution TEQ a1, #&8000 ORREQ v5, v5, #CPUFlag_BaseRestored ; Check store of PC 30 STR pc, [sp, #-4]! ADR a2, %BT30 + 8 LDR a1, [sp], #4 TEQ a1, a2 ORREQ v5, v5, #CPUFlag_StorePCplus8 BL Init_ARMarch STRB a1, [v6, #ProcessorArch] MRC p15, 0, a1, c0, c2, 2 TST a1, #&F000 ORRNE v5, v5, #CPUFlag_LongMul MRC p15, 0, a1, c0, c1, 0 TST a1, #&F000 ORRNE v5, v5, #CPUFlag_Thumb MSR CPSR_f, #Q32_bit MRS lr, CPSR TST lr, #Q32_bit ORRNE v5, v5, #CPUFlag_DSP ; Should we check instruction set attr register 3 for this? ; Other flags not checked for above: ; CPUFlag_InterruptDelay ; CPUFlag_VectorReadException ; CPUFlag_ExtendedPages ; CPUFlag_NoWBDrain ; CPUFlag_AbortRestartBroken ; CPUFlag_XScale ; CPUFlag_XScaleJTAGconnected LDRB v4, [v6, #ProcessorType] TEQ v4, #ARMunk ; Modify deduced flags ADRNEL lr, KnownCPUFlags ADDNE lr, lr, v4, LSL #3 LDMNEIA lr, {a2, a3} ORRNE v5, v5, a2 BICNE v5, v5, a3 STR v5, [v6, #ProcessorFlags] ; Cache analysis LDRB a2, [v6, #Cache_Type] TEQ a2, #CT_ctype_WT TSTEQ v5, #CPUFlag_SplitCache BEQ Analyse_WriteThroughUnified ; eg. ARM7TDMI derivative TEQ a2, #CT_ctype_WB_CR7_LDa BEQ Analyse_WB_CR7_LDa ; eg. ARM9 TEQ a2, #CT_ctype_WB_Crd BEQ Analyse_WB_Crd ; eg. StrongARM TEQ a2, #CT_ctype_WB_Cal_LD BEQ Analyse_WB_Cal_LD ; assume XScale TEQ a2, #CT_ctype_WB_CR7_Lx BEQ Analyse_WB_CR7_Lx ; others ... 
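; Anything not matched above has no suitable set of ARMops, so we drop
; through to WeirdARMPanic and hang deliberately.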
B WeirdARMPanic ; stiff :) ; -------------------------------------------------------------------------- ; ----- ARMops ------------------------------------------------------------- ; -------------------------------------------------------------------------- ; ; ARMops are the routines required by the kernel for cache/MMU control ; the kernel vectors to the appropriate ops for the given ARM at boot ; ; The Rules: ; - These routines may corrupt a1 and lr only ; - (lr can of course only be corrupted whilst still returning to correct ; link address) ; - stack is available, at least 16 words can be stacked ; - a NULL op would be a simple MOV pc, lr ; ; -------------------------------------------------------------------------- ; ----- ARMops for ARMv3 --------------------------------------------------- ; -------------------------------------------------------------------------- ; ; ARMv3 ARMs include ARM710, ARM610, ARM7500 ; Cache_Invalidate_ARMv3 MCR p15, 0, a1, c7, c0 NullOp MOV pc, lr WriteBuffer_Drain_ARMv3 ;swap always forces unbuffered write, stalling till WB empty SUB sp, sp, #4 SWP a1, a1, [sp] ADD sp, sp, #4 MOV pc, lr TLB_Invalidate_ARMv3 MCR p15, 0, a1, c5, c0 MOV pc, lr ; a1 = page entry to invalidate (page aligned address) ; TLB_InvalidateEntry_ARMv3 MCR p15, 0, a1, c6, c0 MOV pc, lr MMU_Changing_ARMv3 MCR p15, 0, a1, c7, c0 ; invalidate cache MCR p15, 0, a1, c5, c0 ; invalidate TLB MOV pc, lr MMU_ChangingUncached_ARMv3 MCR p15, 0, a1, c5, c0 ; invalidate TLB MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_ARMv3 MCR p15, 0, a1, c7, c0 ; invalidate cache MCR p15, 0, a1, c6, c0 ; invalidate TLB entry MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_ARMv3 ROUT CMP a2, #16 ; arbitrary-ish threshold BHS MMU_Changing_ARMv3 Push "a2" MCR p15, 0, a1, c7, c0 ; invalidate cache 10 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2" MOV pc, lr ; a1 = page affected (page aligned address) ; MMU_ChangingUncachedEntry_ARMv3 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry MOV pc, lr ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries_ARMv3 ROUT CMP a2, #16 ; arbitrary-ish threshold BHS MMU_ChangingUncached_ARMv3 Push "a2" 10 MCR p15, 0, a1, c6, c0 ; invalidate TLB entry SUBS a2, a2, #1 ; next page ADD a1, a1, #PageSize BNE %BT10 Pull "a2" MOV pc, lr Cache_RangeThreshold_ARMv3 ! 0, "arbitrary Cache_RangeThreshold_ARMv3" MOV a1, #16*PageSize MOV pc, lr LTORG ; -------------------------------------------------------------------------- ; ----- generic ARMops for simple ARMs, ARMv4 onwards ---------------------- ; -------------------------------------------------------------------------- ; ; eg. 
ARM7TDMI based ARMs, unified, writethrough cache
;
Cache_InvalidateUnified
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7
        MOV     pc, lr

WriteBuffer_Drain_OffOn                 ; used if ARM has no drain WBuffer MCR op
        Push    "a2"
        ARM_read_control a1
        BIC     a2, a1, #MMUC_W
        ARM_write_control a2
        ARM_write_control a1
        Pull    "a2"
        MOV     pc, lr

WriteBuffer_Drain                       ; used if ARM has proper drain WBuffer MCR op
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4
        MOV     pc, lr

TLB_Invalidate_Unified
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7
        MOV     pc, lr

; a1 = page entry to invalidate (page aligned address)
;
TLB_InvalidateEntry_Unified
        MCR     p15, 0, a1, c8, c7, 1
        MOV     pc, lr

MMU_Changing_Writethrough
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c7      ; invalidate cache
        MCR     p15, 0, a1, c8, c7      ; invalidate TLB
        MOV     pc, lr

MMU_ChangingUncached
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7      ; invalidate TLB
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_Writethrough
        Push    "a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7      ; invalidate cache
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        Pull    "a4"
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_Writethrough ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_Changing_Writethrough
        Push    "a2,a4"
        MOV     a4, #0
        MCR     p15, 0, a4, c7, c7      ; invalidate cache
10      MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2,a4"
        MOV     pc, lr

; a1 = page affected (page aligned address)
;
MMU_ChangingUncachedEntry
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        MOV     pc, lr

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries ROUT
        CMP     a2, #16                 ; arbitrary-ish threshold
        BHS     MMU_ChangingUncached
        Push    "a2"
10      MCR     p15, 0, a1, c8, c7, 1   ; invalidate TLB entry
        SUBS    a2, a2, #1              ; next page
        ADD     a1, a1, #PageSize
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr

Cache_RangeThreshold_Writethrough
        ! 0, "arbitrary Cache_RangeThreshold_Writethrough"
        MOV     a1, #16*PageSize
        MOV     pc, lr

; --------------------------------------------------------------------------
; ----- ARMops for ARM9 and the like ---------------------------------------
; --------------------------------------------------------------------------
;
; WB_CR7_LDa refers to ARMs with writeback data cache, cleaned with
; register 7, lockdown available (format A)
;
; Note that ARM920 etc have writeback/writethrough data cache selectable
; by MMU regions. For simplicity, we assume cacheable pages are mostly
; writeback. Any writethrough pages will have redundant clean operations
; applied when moved, for example, but this is a small overhead (cleaning
; a clean line is very quick on ARM 9).
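; As a worked example of the parameters set up by Analyse_WB_CR7_LDa:
;   DCache_IndexBit      = 1 << (32 - ceil(log2(ASSOC)))
;   DCache_IndexSegStart = (NSets - 1) * LineLen
; so a 16k ARM920 data cache (64-way, 8 sets, 32-byte lines) gives
; IndexBit = 1 << 26 = &04000000 and IndexSegStart = 7 * 32 = &E0, matching
; the concrete figures quoted in Cache_CleanAll_WB_CR7_LDa below.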
Cache_CleanAll_WB_CR7_LDa ROUT ; ; only guarantees to clean lines not involved in interrupts (so we can ; clean without disabling interrupts) ; ; Clean cache by traversing all segment and index values ; As a concrete example, for ARM 920 (16k+16k caches) we would have: ; ; DCache_LineLen = 32 (32 byte cache line, segment field starts at bit 5) ; DCache_IndexBit = &04000000 (index field starts at bit 26) ; DCache_IndexSegStart = &000000E0 (start at index=0, segment = 7) ; Push "a2, ip" MOV ip, #0 LDRB a1, [ip, #DCache_LineLen] ; segment field starts at this bit LDR a2, [ip, #DCache_IndexBit] ; index field starts at this bit LDR ip, [ip, #DCache_IndexSegStart] ; starting value, with index at min, seg at max 10 MCR p15, 0, ip, c7, c10, 2 ; clean DCache entry by segment/index ADDS ip, ip, a2 ; next index, counting up, CS if wrapped back to 0 BCC %BT10 SUBS ip, ip, a1 ; next segment, counting down, CC if wrapped back to max BCS %BT10 ; if segment wrapped, then we've finished MOV ip, #0 MCR p15, 0, ip, c7, c10, 4 ; drain WBuffer Pull "a2, ip" MOV pc, lr Cache_CleanInvalidateAll_WB_CR7_LDa ROUT ; ; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache ; Push "a2, ip" MOV ip, #0 LDRB a1, [ip, #DCache_LineLen] ; segment field starts at this bit LDR a2, [ip, #DCache_IndexBit] ; index field starts at this bit LDR ip, [ip, #DCache_IndexSegStart] ; starting value, with index at min, seg at max 10 MCR p15, 0, ip, c7, c14, 2 ; clean&invalidate DCache entry by segment/index ADDS ip, ip, a2 ; next index, counting up, CS if wrapped back to 0 BCC %BT10 SUBS ip, ip, a1 ; next segment, counting down, CC if wrapped back to max BCS %BT10 ; if segment wrapped, then we've finished MOV ip, #0 MCR p15, 0, ip, c7, c10, 4 ; drain WBuffer MCR p15, 0, ip, c7, c5, 0 ; invalidate ICache Pull "a2, ip" MOV pc, lr Cache_InvalidateAll_WB_CR7_LDa ROUT ; ; no clean, assume caller knows what's happening ; MOV a1, #0 MCR p15, 0, a1, c7, c7, 0 ; invalidate ICache and DCache MOV pc, lr Cache_RangeThreshold_WB_CR7_LDa ROUT MOV a1, #0 LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_CR7_LDa ROUT MMU_ChangingUncached_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MOV pc, lr ; a1 = page affected (page aligned address) ; TLB_InvalidateEntry_WB_CR7_LDa ROUT MMU_ChangingUncachedEntry_WB_CR7_LDa MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MOV pc, lr WriteBuffer_Drain_WB_CR7_LDa ROUT MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MOV pc, lr IMB_Full_WB_CR7_LDa ROUT ; ; do: clean DCache; drain WBuffer, invalidate ICache ; Push "lr" BL Cache_CleanAll_WB_CR7_LDa ; also drains Wbuffer MOV a1, #0 MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache Pull "pc" ; a1 = start address (inclusive, cache line aligned) ; a2 = end address (exclusive, cache line aligned) ; IMB_Range_WB_CR7_LDa ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_CR7_LDa Push "lr" MOV lr, #0 LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry by VA MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer Pull "pc" MMU_Changing_WB_CR7_LDa ROUT Push "lr" BL Cache_CleanInvalidateAll_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB Pull "pc" ; a1 = page affected (page aligned address) ; MMU_ChangingEntry_WB_CR7_LDa ROUT Push "a2, lr" ADD a2, a1, #PageSize MOV 
lr, #0 LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MOV lr, #0 MCR p15, 0, lr, c7, c10, 4 ; drain WBuffer SUB a1, a1, #PageSize MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry Pull "a2, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingEntries_WB_CR7_LDa ROUT Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize MOV a3, #0 LDR a3, [a3, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) MOV a3, #0 LDRB a3, [a3, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c14, 1 ; clean&invalidate DCache entry MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MOV a1, lr ; restore start address 20 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 Pull "a2, a3, pc" ; 30 BL Cache_CleanInvalidateAll_WB_CR7_LDa MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB Pull "a2, a3, pc" ; a1 = first page affected (page aligned address) ; a2 = number of pages ; MMU_ChangingUncachedEntries_WB_CR7_LDa ROUT CMP a2, #32 ; arbitrary-ish threshold BHS %FT20 Push "a2" 10 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize SUBS a2, a2, #1 BNE %BT10 Pull "a2" MOV pc, lr ; 20 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MOV pc, lr ; -------------------------------------------------------------------------- ; ----- ARMops for StrongARM and the like ---------------------------------- ; -------------------------------------------------------------------------- ; WB_Crd is Writeback data cache, clean by reading data from cleaner area ; Currently no support for mini data cache on some StrongARM variants. Mini ; cache is always writeback and must have cleaning support, so is very ; awkward to use for cacheable screen, say. ; Global cache cleaning requires address space for private cleaner areas (not accessed ; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency ; hit), which means that the cleaner data is not invalidated afterwards. This is fine for ; RISC OS - where the private area is not used for anything else, and any re-use of the ; cache under interrupts is safe (eg. a page being moved is *never* involved in any ; active interrupts). ; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum ; cleaning cost while guaranteeing proper clean even if previous clean data is present. If ; the clean routine is re-entered, an independent, double sized clean is initiated. This ; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating ; the routine at all. The overhead is small, since by far the most common cleaning will be ; non-re-entered. The upshot is that the cleaner address space available must be at least 4 ; times the cache size: ; 1 : used alternately, on 1st, 3rd, ... non-re-entered cleans ; 2 : used alternately, on 2nd, 4th, ... 
non-re-entered cleans ; 3 : used only for first half of a re-entered clean ; 4 : used only for second half of a re-entered clean ; ; DCache_CleanBaseAddress : start address of total cleaner space ; DCache_CleanNextAddress : start address for next non-re-entered clean, or 0 if re-entered Cache_CleanAll_WB_Crd ROUT ; ; - cleans data cache (and invalidates it as a side effect) ; - can be used with interrupts enabled (to avoid latency over time of clean) ; - can be re-entered ; - see remarks at top of StrongARM ops for discussion of strategy ; Push "a2-a4, v1, v2, lr" MOV lr, #0 LDR a1, [lr, #DCache_CleanBaseAddress] LDR a2, =DCache_CleanNextAddress LDR a3, [lr, #DCache_Size] LDRB a4, [lr, #DCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 LDR lr, [v1], a4 TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Cache_Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer Pull "a2-a4, v1, v2, pc" Cache_CleanInvalidateAll_WB_Crd ROUT IMB_Full_WB_Crd ; ;does not truly invalidate DCache, but effectively invalidates (flushes) all lines not ;involved in interrupts - this is sufficient for OS requirements, and means we don't ;have to disable interrupts for possibly slow clean ; Push "lr" BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache Pull "pc" Cache_InvalidateAll_WB_Crd ; ; no clean, assume caller knows what is happening ; MCR p15, 0, a1, c7, c7, 0 ;flush ICache and DCache MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MOV pc, lr Cache_RangeThreshold_WB_Crd MOV a1, #0 LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_Crd MMU_ChangingUncached_WB_Crd MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB MOV pc, lr TLB_InvalidateEntry_WB_Crd MMU_ChangingUncachedEntry_WB_Crd MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry MCR p15, 0, a1, c8, c5, 0 ;flush ITLB MOV pc, lr WriteBuffer_Drain_WB_Crd MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MOV pc, lr IMB_Range_WB_Crd ROUT SUB a2, a2, a1 CMP a2, #64*1024 ;arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_Crd Push "lr" MOV lr, #0 LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache Pull "pc" MMU_Changing_WB_Crd Push "lr" BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB Pull "pc" MMU_ChangingEntry_WB_Crd ROUT ; ;there is no clean&invalidate DCache instruction, however we can do clean ;entry followed by invalidate entry without an interrupt hole, because they ;are for the same virtual address (and that virtual address will not be ;involved in interrupts, since it is involved in remapping) ; Push "a2, lr" ADD a2, a1, #PageSize MOV lr, #0 LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry MCR p15, 0, a1, c7, c6, 1 ;flush DCache entry ADD a1, a1, lr CMP a1, a2 BLO %BT10 SUB a1, a1, #PageSize MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 
;flush ICache MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "a2, pc" MMU_ChangingEntries_WB_Crd ROUT ; ;same comments as MMU_ChangingEntry_WB_Crd ; Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize MOV a3, #0 LDR a3, [a3, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) MOV a3, #0 LDRB a3, [a3, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c10, 1 ;clean DCache entry MCR p15, 0, a1, c7, c6, 1 ;flush DCache entry ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ;drain WBuffer MCR p15, 0, a1, c7, c5, 0 ;flush ICache MOV a1, lr ;restore start address 20 MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "a2, a3, pc" ; 30 BL Cache_CleanAll_WB_Crd ;clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ;flush ICache MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB Pull "a2, a3, pc" MMU_ChangingUncachedEntries_WB_Crd ROUT CMP a2, #32 ;arbitrary-ish threshold BHS %FT20 Push "lr" MOV lr, a2 10 MCR p15, 0, a1, c8, c6, 1 ;flush DTLB entry ADD a1, a1, #PageSize SUBS lr, lr, #1 BNE %BT10 MCR p15, 0, a1, c8, c5, 0 ;flush ITLB Pull "pc" ; 20 MCR p15, 0, a1, c8, c7, 0 ;flush ITLB and DTLB MOV pc, lr ; ARMops for XScale, mjs Feb 2001 ; ; WB_Cal_LD is writeback, clean with allocate, lockdown ; ; If the mini data cache is used (XScaleMiniCache true), it is assumed to be ; configured writethrough (eg. used for RISC OS screen memory). This saves an ugly/slow ; mini cache clean for things like IMB_Full. ; ; Sadly, for global cache invalidate with mini cache, things are awkward. We can't clean the ; main cache then do the global invalidate MCR, unless we tolerate having _all_ interrupts ; off (else the main cache may be slightly dirty from interrupts, and the invalidate ; will lose data). So we must reluctantly 'invalidate' the mini cache by the ugly/slow ; mechanism as if we were cleaning it :-( Intel should provide a separate global invalidate ; (and perhaps a line allocate) for the mini cache. ; ; We do not use lockdown. ; ; For simplicity, we assume cacheable pages are mostly writeback. Any writethrough ; pages will be invalidated as if they were writeback, but there is little overhead ; (cleaning a clean line or allocating a line from cleaner area are both fast). ; Global cache cleaning requires address space for private cleaner areas (not accessed ; for any other reason). Cleaning is normally with interrupts enabled (to avoid a latency ; hit), which means that the cleaner data is not invalidated afterwards. This is fine for ; RISC OS - where the private area is not used for anything else, and any re-use of the ; cache under interrupts is safe (eg. a page being moved is *never* involved in any ; active interrupts). ; Mostly, cleaning toggles between two separate cache-sized areas, which gives minimum ; cleaning cost while guaranteeing proper clean even if previous clean data is present. If ; the clean routine is re-entered, an independent, double sized clean is initiated. This ; guarantees proper cleaning (regardless of multiple re-entrancy) whilst hardly complicating ; the routine at all. The overhead is small, since by far the most common cleaning will be ; non-re-entered. The upshot is that the cleaner address space available must be at least 4 ; times the cache size: ; 1 : used alternately, on 1st, 3rd, ... non-re-entered cleans ; 2 : used alternately, on 2nd, 4th, ... 
non-re-entered cleans ; 3 : used only for first half of a re-entered clean ; 4 : used only for second half of a re-entered clean ; ; If the mini cache is used, it has its own equivalent cleaner space and algorithm. ; Parameters for each cache are: ; ; Cache_CleanBaseAddress : start address of total cleaner space ; Cache_CleanNextAddress : start address for next non-re-entered clean, or 0 if re-entered GBLL XScaleMiniCache ; *must* be configured writethrough if used XScaleMiniCache SETL {FALSE} ; MACRO to do Intel approved CPWAIT, to guarantee any previous MCR's have taken effect ; corrupts a1 ; MACRO CPWAIT MRC p15, 0, a1, c2, c0, 0 ; arbitrary read of CP15 MOV a1, a1 ; wait for it ; SUB pc, pc, #4 omitted, because all ops have a pc load to return to caller MEND Cache_CleanAll_WB_Cal_LD ROUT ; ; - cleans main cache (and invalidates as a side effect) ; - if mini cache is in use, will be writethrough so no clean required ; - can be used with interrupts enabled (to avoid latency over time of clean) ; - can be re-entered ; - see remarks at top of XScale ops for discussion of strategy ; Push "a2-a4, v1, v2, lr" MOV lr, #0 LDR a1, [lr, #DCache_CleanBaseAddress] LDR a2, =DCache_CleanNextAddress LDR a3, [lr, #DCache_Size] LDRB a4, [lr, #DCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 MCR p15, 0, v1, c7, c2, 5 ; allocate address from cleaner space ADD v1, v1, a4 TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Cache_Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) Pull "a2-a4, v1, v2, pc" [ XScaleMiniCache Cache_MiniInvalidateAll_WB_Cal_LD ROUT ; ; similar to Cache_CleanAll_WB_Cal_LD, but must do direct reads (cannot use allocate address MCR), and ; 'cleans' to achieve invalidate as side effect (mini cache will be configured writethrough) ; Push "a2-a4, v1, v2, lr" MOV lr, #0 LDR a1, [lr, #MCache_CleanBaseAddress] LDR a2, =MCache_CleanNextAddr LDR a3, [lr, #MCache_Size] LDRB a4, [lr, #MCache_LineLen] MOV v2, #0 SWP v1, v2, [a2] ; read current CleanNextAddr, zero it (semaphore) TEQ v1, #0 ; but if it is already zero, we have re-entered ADDEQ v1, a1, a3, LSL #1 ; if re-entered, start clean at Base+2*Cache_Size ADDEQ v2, v1, a3, LSL #1 ; if re-entered, do a clean of 2*Cache_Size ADDNE v2, v1, a3 ; if not re-entered, do a clean of Cache_Size 10 LDR lr, [v1], a4 ; read a line of cleaner data TEQ v1, v2 BNE %BT10 ADD v2, a1, a3, LSL #1 ; compare end address with Base+2*Size CMP v1, v2 MOVEQ v1, a1 ; if equal, not re-entered and Next wraps back STRLS v1, [a2] ; if lower or same, not re-entered, so update Next ; note, no drain WBuffer, since we are really only invalidating a writethrough cache Pull "a2-a4, v1, v2, pc" ] ; XScaleMiniCache Cache_CleanInvalidateAll_WB_Cal_LD ROUT ; ; - cleans main cache (and invalidates wrt OS stuff as a side effect) ; - if mini cache in use (will be writethrough), 'cleans' in order to invalidate as side effect ; Push "lr" BL Cache_CleanAll_WB_Cal_LD [ XScaleMiniCache BL Cache_MiniInvalidateAll_WB_Cal_LD ] MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB CPWAIT Pull 
"pc" Cache_InvalidateAll_WB_Cal_LD ROUT ; ; no clean, assume caller knows what's happening ; MCR p15, 0, a1, c7, c7, 0 ; invalidate DCache, (MiniCache), ICache and BTB CPWAIT MOV pc, lr Cache_RangeThreshold_WB_Cal_LD ROUT MOV a1, #0 LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_Cal_LD ROUT MMU_ChangingUncached_WB_Cal_LD MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT MOV pc, lr TLB_InvalidateEntry_WB_Cal_LD ROUT MMU_ChangingUncachedEntry_WB_Cal_LD MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry CPWAIT MOV pc, lr WriteBuffer_Drain_WB_Cal_LD ROUT MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) MOV pc, lr IMB_Full_WB_Cal_LD Push "lr" BL Cache_CleanAll_WB_Cal_LD ; clean DCache (wrt to non-interrupt stuff) MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB CPWAIT Pull "pc" IMB_Range_WB_Cal_LD ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; arbitrary-ish range threshold ADD a2, a2, a1 BHS IMB_Full_WB_Cal_LD Push "lr" MOV lr, #0 LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, lr CMP a1, a2 BLO %BT10 [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (waits, so no need for CPWAIT) Pull "pc" MMU_Changing_WB_Cal_LD ROUT Push "lr" BL Cache_CleanAll_WB_Cal_LD MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT Pull "pc" MMU_ChangingEntry_WB_Cal_LD ROUT ; ;there is no clean&invalidate DCache instruction, however we can do clean ;entry followed by invalidate entry without an interrupt hole, because they ;are for the same virtual address (and that virtual address will not be ;involved in interrupts, since it is involved in remapping) ; Push "a2, lr" ADD a2, a1, #PageSize MOV lr, #0 LDRB lr, [lr, #DCache_LineLen] 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry MCR p15, 0, a1, c7, c6, 1 ; invalidate DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, lr CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] SUB a1, a1, #PageSize MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry CPWAIT Pull "a2, pc" MMU_ChangingEntries_WB_Cal_LD ROUT ; ;same comments as MMU_ChangingEntry_WB_Cal_LD ; Push "a2, a3, lr" MOV a2, a2, LSL #Log2PageSize MOV a3, #0 LDR a3, [a3, #DCache_RangeThreshold] ;check whether cheaper to do global clean CMP a2, a3 BHS %FT30 ADD a2, a2, a1 ;clean end address (exclusive) MOV a3, #0 LDRB a3, [a3, #DCache_LineLen] MOV lr, a1 10 MCR p15, 0, a1, c7, c10, 1 ; clean DCache entry MCR p15, 0, a1, c7, c6, 1 ; invalidate DCache entry [ :LNOT:XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry ] ADD a1, a1, a3 CMP a1, a2 BLO %BT10 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer [ XScaleJTAGDebug MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache and BTB | MCR p15, 0, a1, c7, c5, 6 ; invalidate BTB ] MOV a1, lr ; restore start address 20 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry ADD a1, a1, #PageSize CMP a1, a2 BLO %BT20 CPWAIT Pull "a2, a3, pc" ; 30 BL Cache_CleanInvalidateAll_WB_Cal_LD MCR p15, 0, a1, c8, c7, 0 ; 
invalidate ITLB and DTLB CPWAIT Pull "a2, a3, pc" MMU_ChangingUncachedEntries_WB_Cal_LD ROUT CMP a2, #32 ; arbitrary-ish threshold BHS %FT20 Push "lr" MOV lr, a2 10 MCR p15, 0, a1, c8, c6, 1 ; invalidate DTLB entry MCR p15, 0, a1, c8, c5, 1 ; invalidate ITLB entry SUBS lr, lr, #1 ADD a1, a1, #PageSize BNE %BT10 CPWAIT Pull "pc" ; 20 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB CPWAIT MOV pc, lr ; -------------------------------------------------------------------------- ; ----- ARMops for Cortex-A8 and the like ---------------------------------- ; -------------------------------------------------------------------------- ; WB_CR7_Lx refers to ARMs with writeback data cache, cleaned with ; register 7, and (potentially) multiple cache levels ; ; DCache_LineLen = log2(line len)-2 for smallest data/unified cache line length ; ICache_LineLen = log2(line len)-2 for smallest instruction cache line length ; DCache_RangeThreshold = clean threshold for data cache ; Cache_Lx_Info = Cache level ID register ; Cache_Lx_DTable = Cache size identification register for all 8 data/unified caches ; Cache_Lx_ITable = Cache size identification register for all 8 instruction caches Cache_CleanAll_WB_CR7_Lx ROUT ; ; Currently disables interrupts to allow safe programming and reading of cache size selection register ; ; Clean cache by traversing all sets and ways for all data caches Push "a2,a3,a4,v1,v2,v3,v4,v5,lr" MOV lr, #ZeroPage LDR a1, [lr, #Cache_Lx_Info]! ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info BIC a1, a1, #&FF000000 ; Discard unification/coherency bits MOV a2, #0 ; Current cache level 20 TST a1, #7 ; Get flags BEQ %FT10 ; Cache clean complete LDR a3, [lr], #4 ; Get size info AND v1, a3, #&7 ; log2(Line size)-2 BIC a3, a3, #&F0000007 ; Clear flags & line size MOV v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits MOV v3, a3, LSR #13 ; Number of sets-1 in upper 15 bits ; Way number needs to be packed right up at the high end of the data word; shift it up CLZ a4, v2 MOV v2, v2, LSL a4 ; Set number needs to start at log2(Line size) MOV v3, v3, LSR #15 ; Start at bit 2 MOV v3, v3, LSL v1 ; Start at log2(Line size) ; Now calculate the offset numbers we will use to increment sets & ways BIC v4, v2, v2, LSL #1 ; Way increment BIC v5, v3, v3, LSL #1 ; Set increment ; Now we can finally clean this cache! ORR a3, a2, v3 ; Current way (0), set (max), and level 30 MCR p15, 0, a3, c7, c10, 2 ; Clean ADDS a3, a3, v4 ; Increment way BCC %BT30 ; Overflow will occur once ways are enumerated TST a3, v3 ; Are set bits all zero? SUBNE a3, a3, v5 ; No, so decrement set and loop around again BNE %BT30 ; This cache is now clean. Move on to the next level. ADD a2, a2, #2 MOVS a1, a1, LSR #3 BNE %BT20 10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer Pull "a2,a3,a4,v1,v2,v3,v4,v5,pc" Cache_CleanInvalidateAll_WB_CR7_Lx ROUT ; ; similar to Cache_CleanAll, but does clean&invalidate of Dcache, and invalidates ICache ; Push "a2,a3,a4,v1,v2,v3,v4,v5,lr" MOV lr, #ZeroPage LDR a1, [lr, #Cache_Lx_Info]! 
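; (Same set/way walk as Cache_CleanAll_WB_CR7_Lx above, but using the
;  clean & invalidate op, c7, c14, 2. As an illustrative example, a 4-way
;  32k level 1 data cache with 64-byte lines is walked with the way number
;  in bits 30-31, the set number in bits 6-13 and the level in bits 1-3.)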
ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info BIC a1, a1, #&FF000000 ; Discard unification/coherency bits MOV a2, #0 ; Current cache level 20 TST a1, #7 ; Get flags BEQ %FT10 ; Cache clean complete LDR a3, [lr], #4 ; Get size info AND v1, a3, #&7 ; log2(Line size)-2 BIC a3, a3, #&F0000007 ; Clear flags & line size MOV v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits MOV v3, a3, LSR #13 ; Number of sets-1 in upper 15 bits ; Way number needs to be packed right up at the high end of the data word; shift it up CLZ a4, v2 MOV v2, v2, LSL a4 ; Set number needs to start at log2(Line size) MOV v3, v3, LSR #15 ; Start at bit 2 MOV v3, v3, LSL v1 ; Start at log2(Line size) ; Now calculate the offset numbers we will use to increment sets & ways BIC v4, v2, v2, LSL #1 ; Way increment BIC v5, v3, v3, LSL #1 ; Set increment ; Now we can finally clean this cache! ORR a3, a2, v3 ; Current way (0), set (max), and level 30 MCR p15, 0, a3, c7, c14, 2 ; Clean & invalidate ADDS a3, a3, v4 ; Increment way BCC %BT30 ; Overflow will occur once ways are enumerated TST a3, v3 ; Are set bits all zero? SUBNE a3, a3, v5 ; No, so decrement set and loop around again BNE %BT30 ; This cache is now clean. Move on to the next level. ADD a2, a2, #2 MOVS a1, a1, LSR #3 BNE %BT20 10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache Pull "a2,a3,a4,v1,v2,v3,v4,v5,pc" Cache_InvalidateAll_WB_CR7_Lx ROUT ; ; no clean, assume caller knows what's happening ; Push "a2,a3,a4,v1,v2,v3,v4,v5,lr" MOV lr, #ZeroPage LDR a1, [lr, #Cache_Lx_Info]! ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info BIC a1, a1, #&FF000000 ; Discard unification/coherency bits MOV a2, #0 ; Current cache level 20 TST a1, #7 ; Get flags BEQ %FT10 ; Cache clean complete LDR a3, [lr], #4 ; Get size info AND v1, a3, #&7 ; log2(Line size)-2 BIC a3, a3, #&F0000007 ; Clear flags & line size MOV v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits MOV v3, a3, LSR #13 ; Number of sets-1 in upper 15 bits ; Way number needs to be packed right up at the high end of the data word; shift it up CLZ a4, v2 MOV v2, v2, LSL a4 ; Set number needs to start at log2(Line size) MOV v3, v3, LSR #15 ; Start at bit 2 MOV v3, v3, LSL v1 ; Start at log2(Line size) ; Now calculate the offset numbers we will use to increment sets & ways BIC v4, v2, v2, LSL #1 ; Way increment BIC v5, v3, v3, LSL #1 ; Set increment ; Now we can finally clean this cache! ORR a3, a2, v3 ; Current way (0), set (max), and level 30 MCR p15, 0, a3, c7, c6, 2 ; Invalidate ADDS a3, a3, v4 ; Increment way BCC %BT30 ; Overflow will occur once ways are enumerated TST a3, v3 ; Are set bits all zero? SUBNE a3, a3, v5 ; No, so decrement set and loop around again BNE %BT30 ; This cache is now clean. Move on to the next level. 
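; (The level field of the set/way word occupies bits 1-3, so stepping to the
;  next cache level means adding 2 rather than 1.)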
ADD a2, a2, #2 MOVS a1, a1, LSR #3 BNE %BT20 10 MOV a1, #0 MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache Pull "a2,a3,a4,v1,v2,v3,v4,v5,pc" Cache_RangeThreshold_WB_CR7_Lx ROUT MOV a1, #0 LDR a1, [a1, #DCache_RangeThreshold] MOV pc, lr TLB_InvalidateAll_WB_CR7_Lx ROUT MMU_ChangingUncached_WB_CR7_Lx MOV a1, #0 MCR p15, 0, a1, c8, c7, 0 ; invalidate ITLB and DTLB MOV pc, lr ; a1 = page affected (page aligned address) ; TLB_InvalidateEntry_WB_CR7_Lx ROUT MMU_ChangingUncachedEntry_WB_CR7_Lx MCR p15, 0, a1, c8, c7, 1 ; invalidate ITLB & DTLB entry MOV pc, lr WriteBuffer_Drain_WB_CR7_Lx ROUT MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer MOV pc, lr IMB_Full_WB_CR7_Lx ROUT ; ; do: clean DCache; drain WBuffer, invalidate ICache ; Luckily, we only need to clean as far as the level of unification ; Push "a2,a3,a4,v1,v2,v3,v4,v5,lr" MOV lr, #ZeroPage LDR a1, [lr, #Cache_Lx_Info]! ADD lr, lr, #Cache_Lx_DTable-Cache_Lx_Info MOV a1, a1, LSR #27 AND a1, a1, #&7 ; Get level of unification MOV a2, #0 ; Current cache level SUBS a1, a1, #1 BLT %FT10 ; Cache clean complete 20 LDR a3, [lr], #4 ; Get size info AND v1, a3, #&7 ; log2(Line size)-2 BIC a3, a3, #&F0000007 ; Clear flags & line size MOV v2, a3, LSL #19 ; Number of ways-1 in upper 10 bits MOV v3, a3, LSR #13 ; Number of sets-1 in upper 15 bits ; Way number needs to be packed right up at the high end of the data word; shift it up CLZ a4, v2 MOV v2, v2, LSL a4 ; Set number needs to start at log2(Line size) MOV v3, v3, LSR #15 ; Start at bit 2 MOV v3, v3, LSL v1 ; Start at log2(Line size) ; Now calculate the offset numbers we will use to increment sets & ways BIC v4, v2, v2, LSL #1 ; Way increment BIC v5, v3, v3, LSL #1 ; Set increment ; Now we can finally clean this cache! ORR a3, a2, v3 ; Current way (0), set (max), and level 30 MCR p15, 0, a3, c7, c10, 2 ; Clean ADDS a3, a3, v4 ; Increment way BCC %BT30 ; Overflow will occur once ways are enumerated TST a3, v3 ; Are set bits all zero? SUBNE a3, a3, v5 ; No, so decrement set and loop around again BNE %BT30 ; This cache is now clean. Move on to the next level. ADD a2, a2, #2 SUBS a1, a1, #1 BGE %BT20 10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (is this required?) MCR p15, 0, a1, c7, c5, 0 ; invalidate ICache Pull "a2,a3,a4,v1,v2,v3,v4,v5,pc" ; a1 = start address (inclusive, cache line aligned) ; a2 = end address (exclusive, cache line aligned) ; IMB_Range_WB_CR7_Lx ROUT SUB a2, a2, a1 CMP a2, #32*1024 ; Maximum L1 cache size on Cortex-A8 is 32K, use that to guess what approach to take BGE IMB_Full_WB_CR7_Lx Push "a3,lr" MOV lr, #0 LDRB lr, [lr, #DCache_LineLen] ; log2(line len)-2 MOV a3, #4 MOV lr, a3, LSL lr 10 MCR p15, 0, a1, c7, c11, 1 ; clean DCache entry by VA to PoU MCR p15, 0, a1, c7, c5, 1 ; invalidate ICache entry (to PoC - is this bad?) ADD a1, a1, lr CMP a1, a2 BLO %BT10 MOV a1, #0 MCR p15, 0, a1, c7, c10, 4 ; drain WBuffer (required?) 

MMU_Changing_WB_CR7_Lx ROUT
        Push    "lr"
        BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0   ; invalidate ITLB and DTLB
        Pull    "pc"

; a1 = page affected (page aligned address)
;
MMU_ChangingEntry_WB_CR7_Lx ROUT
        Push    "a2, lr"
        MOV     lr, #0
        LDRB    lr, [lr, #DCache_LineLen] ; log2(line len)-2
        MOV     a2, #4
        MOV     lr, a2, LSL lr
        ADD     a2, a1, #PageSize
10      MCR     p15, 0, a1, c7, c14, 1  ; clean&invalidate DCache entry to PoC
        MCR     p15, 0, a1, c7, c5, 1   ; invalidate ICache entry to PoC
        ADD     a1, a1, lr
        CMP     a1, a2
        BLO     %BT10
        MOV     lr, #0
        MCR     p15, 0, lr, c7, c10, 4  ; drain WBuffer
        SUB     a1, a1, #PageSize
        MCR     p15, 0, a1, c8, c7, 1   ; invalidate DTLB and ITLB
        Pull    "a2, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingEntries_WB_CR7_Lx ROUT
        Push    "a2, a3, lr"
        MOV     a2, a2, LSL #Log2PageSize
        MOV     a3, #0
        LDR     a3, [a3, #DCache_RangeThreshold] ; check whether cheaper to do global clean
        CMP     a2, a3
        BHS     %FT30
        ADD     a2, a2, a1              ; clean end address (exclusive)
        MOV     a3, #0
        LDRB    a3, [a3, #DCache_LineLen] ; log2(line len)-2
        MOV     lr, #4
        MOV     a3, lr, LSL a3
        MOV     lr, a1
10      MCR     p15, 0, a1, c7, c14, 1  ; clean&invalidate DCache entry to PoC
        MCR     p15, 0, a1, c7, c5, 1   ; invalidate ICache entry to PoC
        ADD     a1, a1, a3
        CMP     a1, a2
        BLO     %BT10
        MOV     a1, #0
        MCR     p15, 0, a1, c7, c10, 4  ; drain WBuffer
        MOV     a1, lr                  ; restore start address
20      MCR     p15, 0, a1, c8, c7, 1   ; invalidate DTLB & ITLB entry
        ADD     a1, a1, #PageSize
        CMP     a1, a2
        BLO     %BT20
        Pull    "a2, a3, pc"
;
30      BL      Cache_CleanInvalidateAll_WB_CR7_Lx
        MOV     a1, #0
        MCR     p15, 0, a1, c8, c7, 0   ; invalidate ITLB and DTLB
        Pull    "a2, a3, pc"

; a1 = first page affected (page aligned address)
; a2 = number of pages
;
MMU_ChangingUncachedEntries_WB_CR7_Lx ROUT
        CMP     a2, #32                 ; arbitrary-ish threshold
        BHS     %FT20
        Push    "a2"
10      MCR     p15, 0, a1, c8, c7, 1   ; invalidate DTLB & ITLB entry
        ADD     a1, a1, #PageSize
        SUBS    a2, a2, #1
        BNE     %BT10
        Pull    "a2"
        MOV     pc, lr
;
20      MCR     p15, 0, a1, c8, c7, 0   ; invalidate ITLB and DTLB
        MOV     pc, lr

; --------------------------------------------------------------------------
;
        IMPORT  Write0_Translated

ARM_PrintProcessorType
        MOV     a1, #ZeroPage
        LDRB    a1, [a1, #ProcessorType]
        TEQ     a1, #ARMunk
        MOVEQ   pc, lr

        Push    "lr"
        ADR     a2, PNameTable
        LDR     a1, [a2, a1, LSL #1]
        MOV     a1, a1, LSL #16
        ADD     a1, a2, a1, LSR #16
        BL      Write0_Translated
        SWI     XOS_NewLine
        SWI     XOS_NewLine
        Pull    "pc"

PNameTable
        DCW     PName_ARM600    - PNameTable
        DCW     PName_ARM610    - PNameTable
        DCW     PName_ARM700    - PNameTable
        DCW     PName_ARM710    - PNameTable
        DCW     PName_ARM710a   - PNameTable
        DCW     PName_SA110     - PNameTable   ; pre rev T
        DCW     PName_SA110     - PNameTable   ; rev T or later
        DCW     PName_ARM7500   - PNameTable
        DCW     PName_ARM7500FE - PNameTable
        DCW     PName_SA1100    - PNameTable
        DCW     PName_SA1110    - PNameTable
        DCW     PName_ARM720T   - PNameTable
        DCW     PName_ARM920T   - PNameTable
        DCW     PName_ARM922T   - PNameTable
        DCW     PName_X80200    - PNameTable
        DCW     PName_X80321    - PNameTable
        DCW     PName_Cortex_A8 - PNameTable

PName_ARM600    = "600:ARM 600 Processor",0
PName_ARM610    = "610:ARM 610 Processor",0
PName_ARM700    = "700:ARM 700 Processor",0
PName_ARM710    = "710:ARM 710 Processor",0
PName_ARM710a   = "710a:ARM 710a Processor",0
PName_SA110     = "SA110:SA-110 Processor",0
PName_ARM7500   = "7500:ARM 7500 Processor",0
PName_ARM7500FE = "7500FE:ARM 7500FE Processor",0
PName_SA1100    = "SA1100:SA-1100 Processor",0
PName_SA1110    = "SA1110:SA-1110 Processor",0
PName_ARM720T   = "720T:ARM 720T Processor",0
PName_ARM920T   = "920T:ARM 920T Processor",0
PName_ARM922T   = "922T:ARM 922T Processor",0
PName_X80200    = "X80200:80200 Processor",0
PName_X80321    = "X80321:80321 Processor",0
PName_Cortex_A8 = "CortexA8:Cortex-A8 Processor",0

        ALIGN

; Lookup tables from DA flags PCB (bits 14:12,5,4, packed down to 4:2,1,0)
; to XCB bits in page table descriptors.

XCB_NB  *       1:SHL:0
XCB_NC  *       1:SHL:1
XCB_P   *       1:SHL:2

        ALIGN 32

; WT read-allocate cache (eg ARM720T)
XCBTableWT                                      ; C+B  CNB  NCB  NCNB
        =       L2_C+L2_B, L2_C, L2_B, 0        ; Default
        =       L2_C+L2_B, L2_C, L2_B, 0        ; WT, X, Non-merging, X
        =       L2_C+L2_B, L2_C, L2_B, 0        ; WB/RA, X, Merging, X
        =       L2_C+L2_B, L2_C, L2_B, 0        ; WB/WA, X, X, X
        =       L2_C+L2_B, L2_C, L2_B, 0        ; Alt DCache, X, X, X
        =       L2_C+L2_B, L2_C, L2_B, 0        ; X, X, X, X
        =       L2_C+L2_B, L2_C, L2_B, 0        ; X, X, X, X
        =       L2_C+L2_B, L2_C, L2_B, 0        ; X, X, X, X

; SA-110 in Risc PC - WB only read-allocate cache, non-merging WB
XCBTableSA110
        =       L2_C+L2_B, 0, L2_B, 0           ; Default
        =       L2_B,      0, L2_B, 0           ; WT, X, Non-merging, X
        =       L2_C+L2_B, 0, L2_B, 0           ; WB/RA, X, Merging, X
        =       L2_C+L2_B, 0, L2_B, 0           ; WB/WA, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; Alt DCache, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; X, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; X, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; X, X, X, X

; ARMv5 WB/WT read-allocate cache, non-merging WB (eg ARM920T)
XCBTableWBR
        =       L2_C+L2_B, 0, L2_B, 0           ; Default
        =       L2_C,      0, L2_B, 0           ; WT, X, Non-merging, X
        =       L2_C+L2_B, 0, L2_B, 0           ; WB/RA, X, Merging, X
        =       L2_C+L2_B, 0, L2_B, 0           ; WB/WA, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; Alt DCache, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; X, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; X, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; X, X, X, X

; SA-1110 - WB only read allocate cache, merging WB, mini D-cache
XCBTableSA1110
        =       L2_C+L2_B, 0, L2_B, 0           ; Default
        =       L2_B,      0, 0,    0           ; WT, X, Non-merging, X
        =       L2_C+L2_B, 0, L2_B, 0           ; WB/RA, X, Merging, X
        =       L2_C+L2_B, 0, L2_B, 0           ; WB/WA, X, X, X
        =       L2_C,      0, L2_B, 0           ; Alt DCache, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; X, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; X, X, X, X
        =       L2_C+L2_B, 0, L2_B, 0           ; X, X, X, X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
; defaulting to read-allocate
XCBTableXScaleRA
        =       L2_C+L2_B,      0, L2_B,      0 ; Default
        =       L2_C,           0, L2_X+L2_B, 0 ; WT, X, Non-merging, X
        =       L2_C+L2_B,      0, L2_B,      0 ; WB/RA, X, Merging, X
        =       L2_X+L2_C+L2_B, 0, L2_B,      0 ; WB/WA, X, X, X
        =       L2_X+L2_C,      0, L2_B,      0 ; Alt DCache, X, X, X
        =       L2_C+L2_B,      0, L2_B,      0 ; X, X, X, X
        =       L2_C+L2_B,      0, L2_B,      0 ; X, X, X, X
        =       L2_C+L2_B,      0, L2_B,      0 ; X, X, X, X

; XScale - WB/WT read or write-allocate cache, merging WB, mini D-cache
; defaulting to write-allocate
XCBTableXScaleWA
        =       L2_X+L2_C+L2_B, 0, L2_B,      0 ; Default
        =       L2_C,           0, L2_X+L2_B, 0 ; WT, X, Non-merging, X
        =       L2_C+L2_B,      0, L2_B,      0 ; WB/RA, X, Merging, X
        =       L2_X+L2_C+L2_B, 0, L2_B,      0 ; WB/WA, X, X, X
        =       L2_X+L2_C,      0, L2_B,      0 ; Alt DCache, X, X, X
        =       L2_X+L2_C+L2_B, 0, L2_B,      0 ; X, X, X, X
        =       L2_X+L2_C+L2_B, 0, L2_B,      0 ; X, X, X, X
        =       L2_X+L2_C+L2_B, 0, L2_B,      0 ; X, X, X, X

        END