Commit b47fdbb1 authored by Jeffrey Lee's avatar Jeffrey Lee
Browse files

Add OS_Memory 19, which is intended to replace the OS_Memory 0 "make...

Add OS_Memory 19, which is intended to replace the OS_Memory 0 "make uncacheable" feature, when used for DMA

  Making pages uncacheable to allow them to be used with DMA can be troublesome for a number of reasons:
  * Many processors ignore cache hits for non-cacheable pages, so to avoid breaking any IRQ handlers the page table manipulation + cache maintenance must be performed with IRQs disabled, impacting the IRQ latency of the system
  * Some processors don't support LDREX/STREX to non-cacheable pages
  * In SMP setups it may be necessary to temporarily park the other cores somewhere safe, or perform some other explicit synchronisation to make sure they all have consistent views of the cache/TLB
  The above issues are most likely to cause problems when the page is shared by multiple programs; a DMA operation which targets one part of a page could impact the programs which are using the other parts.
  To combat these problems, OS_Memory 19 is being introduced, which allows DMA cache coherency/address translation to be performed without altering the attributes of the pages.
  Files changed:
  - hdr/OSMem - Add definitions for OS_Memory 19
  - s/MemInfo - Add OS_Memory 19 implementation
  Tested on Raspberry Pi 3, i.MX6

Version 5.86, Tagged as 'Kernel-5_86-4_129_2_3'
parent 9944afaf
......@@ -13,11 +13,11 @@
GBLS Module_ComponentPath
Module_MajorVersion SETS "5.86"
Module_Version SETA 586
Module_MinorVersion SETS ""
Module_Date SETS "29 Jul 2017"
Module_ApplicationDate SETS "29-Jul-17"
Module_MinorVersion SETS ""
Module_Date SETS "12 Aug 2017"
Module_ApplicationDate SETS "12-Aug-17"
Module_ComponentName SETS "Kernel"
Module_ComponentPath SETS "castle/RiscOS/Sources/Kernel"
Module_FullVersion SETS "5.86 ("
Module_HelpVersion SETS "5.86 (29 Jul 2017)"
Module_FullVersion SETS "5.86 ("
Module_HelpVersion SETS "5.86 (12 Aug 2017)"
......@@ -5,19 +5,19 @@
#define Module_MajorVersion_CMHG 5.86
#define Module_MinorVersion_CMHG
#define Module_Date_CMHG 29 Jul 2017
#define Module_MinorVersion_CMHG
#define Module_Date_CMHG 12 Aug 2017
#define Module_MajorVersion "5.86"
#define Module_Version 586
#define Module_MinorVersion ""
#define Module_Date "29 Jul 2017"
#define Module_MinorVersion ""
#define Module_Date "12 Aug 2017"
#define Module_ApplicationDate "29-Jul-17"
#define Module_ApplicationDate "12-Aug-17"
#define Module_ComponentName "Kernel"
#define Module_ComponentPath "castle/RiscOS/Sources/Kernel"
#define Module_FullVersion "5.86 ("
#define Module_HelpVersion "5.86 (29 Jul 2017)"
#define Module_FullVersion "5.86 ("
#define Module_HelpVersion "5.86 (12 Aug 2017)"
#define Module_LibraryVersionInfo "5:86"
......@@ -109,6 +109,7 @@ OSMemReason_ReleasePhysAddr * 15 ; Release the temp mapping
; OS_Memory reason codes (R0 bits 0-7)
OSMemReason_MemoryAreaInfo * 16 ; Return size & location of various non-DA areas
OSMemReason_MemoryAccessPrivileges * 17 ; Decode AP numbers into permission flags
OSMemReason_FindAccessPrivilege * 18 ; Find best AP number from given permission flags
OSMemReason_DMAPrep * 19 ; Convert PA <-> LA, perform cache maintenance required for DMA
OSMemReason_CheckMemoryAccess * 24 ; Return attributes/permissions for a logical address range
; OS_Memory 17/18 permission flags
......@@ -119,6 +120,12 @@ MemPermission_PrivX * 1<<3 ; Executable in privileged modes
MemPermission_PrivW * 1<<4 ; Writable in privileged modes
MemPermission_PrivR * 1<<5 ; Readable in privileged modes
; OS_Memory 19 (DMAPrep) flags
; Bits 8-10 are flags passed in R0 alongside the reason code (bits 0-7);
; DMAPrep_UseBounceBuffer is a flag in the input/output function's flags word.
DMAPrep_PhysProvided * 1<<8 ; Input function provides physical addresses, not logical
DMAPrep_Write * 1<<9 ; DMA is writing to RAM
DMAPrep_End * 1<<10 ; DMA is complete, perform any post-op cache maintenance
DMAPrep_UseBounceBuffer * 1 ; Input/output function flag: Must use bounce buffer for this block
; OS_Memory 24 (CheckMemoryAccess) flags
CMA_Completely_UserR * 1<<0 ; completely readable in user mode
CMA_Completely_UserW * 1<<1 ; completely writable in user mode
......@@ -69,7 +69,7 @@ MemReturn
B MemoryAreaInfo ; 16
B MemoryAccessPrivileges ; 17
B FindAccessPrivilege ; 18
B %BT20 ; 19 |
B DMAPrep ; 19
B %BT20 ; 20 | Reserved for us
B %BT20 ; 21 |
B %BT20 ; 22 |
......@@ -1009,6 +1009,8 @@ ReleasePhysAddr
BL RISCOS_ReleasePhysicalAddress
Pull "r0-r3,r12,pc"
; In: r0 = flags
......@@ -1324,6 +1326,354 @@ FindAccessPrivilege ROUT
MakeErrorBlock AccessPrivilegeNotFound
; In: r0 = flags
; bit meaning
; 0-7 19 (reason code)
; 8 Input function provides physical addresses
; 9 DMA is writing to RAM
; 10 DMA is complete, perform any post-op cache maintenance
; 11-31 reserved (set to 0)
; r1 = R12 value to provide to called functions
; r2 = Initial R9 value to provide to input function
; r3 -> Input function:
; in: r9 = r2 from SWI / value from previous call
; r12 = r1 from SWI
; out: r0 = start address of region
; r1 = length of region (0 if end of transfer)
; r2 = flags:
; bit 0: Bounce buffer will be used
; r9 = new r9 for next input call
; r12 corrupt
; r4 = Initial R9 value to provide to output function
; r5 -> Output function (if bit 10 of R0 clear):
; in: r0 = logical address of start of region
; r1 = physical address of start of region
; r2 = length of region
; r3 = flags:
; bit 0: Bounce buffer must be used
; r9 = r4 from SWI / value from previous call
; r12 = r1 from SWI
; out: r9 = new r9 value for next output call
; r0-r3, r12 corrupt
; Out: r2, r4 updated to match values returned by input/output calls
; All other regs preserved
; Performs address translation and cache maintenance necessary to allow for DMA
; to be performed to/from cacheable memory.
; To allow Service_PagesUnsafe to be dealt with in a straightforward manner, we
; have to be careful not to cache the results of any address translations over
; calls to the input/output functions. E.g. if the output function tries to
; allocate from PCI RAM, that may trigger claiming of a specific page by the
; PCI DA, potentially invalidating any existing logical -> physical translation.
; This restriction hampers the routine's ability to merge together input and
; output blocks, and to perform minimal cache maintenance. However for typical
; scatter lists of low to medium complexity it should still produce acceptable
; output.
; Note that if the input function provides physical addresses, the caller must
; take care to abort the entire operation if one of the physical pages involved
; in the request becomes claimed by someone else while the OS_Memory call is in
; progress. This is because we have no sensible way of dealing with this case
; ourselves (even if we didn't attempt to call the input function multiple times
; and merge together the blocks, we'd still have to buffer things internally to
; deal with when blocks need splitting for cache alignment)
; Internally, blocks are stored in the following format:
; Word 0 = Start logical address (incl.)
; Word 1 = Logical -> physical address offset (low bits) + flags (high bits)
; Word 2 = End logical address (excl.)
; This minimises the number of registers needed to hold a block, and simplifies
; the merge calculation (blocks can be merged if words 2 + 1 of first block
; match words 0 + 1 of second block)
; Workspace struct that's stored on the stack
; (blocks are the 3-word {start LA, LA->PA offset + flags, end LA} format
; described above)
^ 0
DMAPrepW_InHold # 12 ; Held-over input block that couldn't be merged
DMAPrepW_InChunk # 12 ; Current input block being processed
DMAPrepW_PhyChunk # 12 ; Translated (physical) block being accumulated
DMAPrepW_CacheMask # 4 ; Cache line length - 1
DMAPrepW_ARMop # 4 ; Cache maintenance ARMop to use
DMAPrepW_CamEntriesPointer # 4 ; CamEntriesPointer copy
DMAPrepW_Size # 0 ; Size of the workspace allocated on entry
; These next few correspond directly to the input registers in the stack frame
DMAPrepW_Flags # 4 ; R0: reason code + DMAPrep_* flags
DMAPrepW_R12 # 4 ; R1: R12 value to pass to called functions
DMAPrepW_InR9 # 4 ; R2: R9 value for next input function call
DMAPrepW_InFunc # 4 ; R3: pointer to input function
DMAPrepW_OutR9 # 4 ; R4: R9 value for next output function call
DMAPrepW_OutFunc # 4 ; R5: pointer to output function
; Shift used to pack the input/output function flags into the high bits of
; word 1 of a block (above the LA->PA offset)
DMAPrep_FlagOffset * 20
DMAPrep_NonCacheable * 1:SHL:21 ; Internal flag used for tracking non-cacheable pages
CMP r0, #1<<11
BHS MemoryBadParameters
; The end of a read from RAM is a no-op (no cache maintenance required)
AND r11, r0, #DMAPrep_Write :OR: DMAPrep_End
TEQ r11, #DMAPrep_End
MOVEQ pc, lr
Entry "r0-r9", DMAPrepW_Size
; Determine the cache maintenance function we need to use
CMP r11, #DMAPrep_Write
LDR r10, =ZeroPage
ASSERT DMAPrep_End > DMAPrep_Write
LDRLE r11, [r10, #Proc_Cache_CleanRange] ; Start of DMA (read or write)
LDRGT r11, [r10, #Proc_Cache_InvalidateRange] ; End of DMA write
STR r11, [sp, #DMAPrepW_ARMop]
; Get the params needed for address translation
LDR r6, [r10, #CamEntriesPointer]
LDR r7, [r10, #MaxCamEntry]
LDR r8, =L2PT
; Init workspace
STR r6, [sp, #DMAPrepW_CamEntriesPointer]
; Get the cache line mask value
[ MEMM_Type == "ARM600"
LDRB r1, [r10, #DCache_LineLen]
; Yuck, need to probe for the last cache level
MOV r5, #Cache_Lx_MaxLevel-1
MOV r1, r5
ARMop Cache_Examine,,,r10
CMP r1, #0
SUBEQ r5, r5, #1
CMP r3, r1
MOVHI r1, r3
SUB r1, r1, #1
STR r1, [sp, #DMAPrepW_CacheMask]
; Get initial input region
BL DMAPrep_CallInputFunc
CMP r0, r3
STMIA lr, {r0, r2, r3}
; Get another input region, see if we can merge it with InChunk
BL DMAPrep_CallInputFunc
CMP r0, r3
LDMIB lr, {r4, r5}
CMP r4, r2
CMPEQ r5, r0
STREQ r3, [lr, #8]
; Can't merge this region, store it in InHold
ASSERT DMAPrepW_InHold = DMAPrepW_InChunk-12
STMDB lr, {r0, r2, r3}
; Perform address translation for the start of InChunk
LDR r4, [sp, #DMAPrepW_InChunk]
BL DMAPrep_Translate
; Store in PhyChunk
ADD lr, sp, #DMAPrepW_PhyChunk
STMIA lr, {r4-r6}
; Align start to cache boundary
TST r5, #DMAPrep_NonCacheable+(DMAPrep_UseBounceBuffer :SHL: DMAPrep_FlagOffset)
LDR lr, [sp, #DMAPrepW_Flags]
LDR r10, [sp, #DMAPrepW_CacheMask]
TST lr, #DMAPrep_Write
TSTNE r4, r10
; Unaligned write to cacheable memory -> bounce required
ADD r1, r4, r10
BIC r1, r1, r10 ; End of current cache line
; Only round down to end of current cache line if the end of the chunk
; is at or beyond the end of the next cache line
ADD r2, r1, r10 ; Last byte we can accept without needing to truncate
CMP r6, r2
MOVHI r6, r1 ; Truncate!
ORR r5, r5, #DMAPrep_UseBounceBuffer :SHL: DMAPrep_FlagOffset
B %FT40
; Start doesn't need splitting, so translate + append more pages
ADD lr, sp, #DMAPrepW_InChunk
ASSERT DMAPrepW_PhyChunk = DMAPrepW_InChunk + 12
LDMIA lr, {r0-r2, r4-r6}
SUB r3, r6, r4 ; Length of translated region
SUB r2, r2, r0 ; Length of input region
CMP r3, r2
ADD r4, r0, r3 ; Translate next address in input address space
BL DMAPrep_Translate
; Try and merge with PhyChunk
ADD lr, sp, #DMAPrepW_PhyChunk
LDMIB lr, {r0, r1}
CMP r0, r5
CMPEQ r1, r4
STREQ r6, [sp, #DMAPrepW_PhyChunk + 8]
LDMIA lr, {r4-r6}
; Can't merge any more pages into this chunk {r4-r6}
; Truncate / bounce the end if necessary
TST r5, #DMAPrep_NonCacheable+(DMAPrep_UseBounceBuffer :SHL: DMAPrep_FlagOffset)
LDR lr, [sp, #DMAPrepW_Flags]
LDR r10, [sp, #DMAPrepW_CacheMask]
TST lr, #DMAPrep_Write
TSTNE r6, r10
; Unaligned write to cacheable memory -> bounce required
BIC r3, r6, r10
CMP r3, r4
ORREQ r5, r5, #DMAPrep_UseBounceBuffer :SHL: DMAPrep_FlagOffset ; Bounce
MOVNE r6, r3 ; Truncate
; Perform cache maintenance if necessary
; For safety we always perform this before calling the output function, rather than caching and attempting to merge the regions (output function may alter cacheability of pages?)
TST r5, #DMAPrep_NonCacheable+(DMAPrep_UseBounceBuffer :SHL: DMAPrep_FlagOffset)
ADD r1, r6, r10
BIC r0, r4, r10
BIC r1, r1, r10
MOV lr, pc
LDR pc, [sp, #DMAPrepW_ARMop]
; Call the output function
LDR lr, [sp, #DMAPrepW_Flags]
TST lr, #DMAPrep_End
BNE %FT60 ; No output func for end-of-op
MOV r0, r4
ADD r1, r4, r5, LSL #12
SUB r2, r6, r4
MOV r3, r5, LSR #DMAPrep_FlagOffset
LDR r12, [sp, #DMAPrepW_R12]
AND r3, r3, #DMAPrep_UseBounceBuffer ; Mask out internal flags
ADD r9, sp, #DMAPrepW_OutR9
CLRV ; Ensure V is clear on entry so simple functions don't confuse us
MOV lr, pc
ASSERT DMAPrepW_OutFunc = DMAPrepW_OutR9 + 4
LDMIA r9, {r9, pc} ; Call output function
STR r9, [sp, #DMAPrepW_OutR9] ; Always write back updated R9
; Advance InChunk by the length of {r4-r6}
LDR r0, [sp, #DMAPrepW_InChunk]
ADD r0, r0, r6
LDR r1, [sp, #DMAPrepW_InChunk+8]
SUB r0, r0, r4
STR r0, [sp, #DMAPrepW_InChunk]
CMP r0, r1
; InChunk depleted, copy InHold to InChunk and try for more input
ADD lr, sp, #DMAPrepW_InChunk
ASSERT DMAPrepW_InHold = 0
LDMIA sp, {r0,r2,r3}
CMP r0, r3
; InHold was empty, so no more regions to process
ADRL r0, ErrorBlock_BadAddress
B %BT90
B MemoryBadParameters
; Out: R0, R2, R3 = block
; LR -> InChunk
; R1, R4, R9, R12 corrupt
MOV r4, lr ; Avoid pushing onto stack, to simplify workspace indexing and error handling
LDR r12, [sp, #DMAPrepW_R12]
ADD r9, sp, #DMAPrepW_InR9
CLRV ; Ensure V is clear on entry so simple functions don't confuse us
MOV lr, pc
ASSERT DMAPrepW_InFunc = DMAPrepW_InR9 + 4
LDMIA r9, {r9, pc} ; Call the input function
STR r9, [sp, #DMAPrepW_InR9] ; Always write back updated R9
CMP r2, #DMAPrep_UseBounceBuffer
; Pack into InChunk
MOV r2, r2, LSL #DMAPrep_FlagOffset
ADD lr, sp, #DMAPrepW_InChunk
ADD r3, r0, r1
MOV pc, r4
; Translate the start of InChunk into a block
; In: r4 = Address to translate
; r7 = MaxCamEntry
; r8 -> L2PT
; Out: r4, r5, r6 = block
; r1, r3, r9-r12 corrupt
MOV r1, lr
LDR r12, [sp, #DMAPrepW_InChunk+8]
SUB r12, r12, r4 ; Length of input region
LDR lr, [sp, #DMAPrepW_Flags]
LDR r6, [sp, #DMAPrepW_CamEntriesPointer]
TST lr, #DMAPrep_PhysProvided
[ AMB_LazyMapIn
MOV r9, r0
MOV r0, r4
BL AMB_MakeHonestLA
MOV r0, r9
BL logical_to_physical ; r4, r8 -> r5
BLCC physical_to_ppn ; r5, r7 -> r3
; r9-r11 corrupt
; Grab page flags
ADD lr, r6, r3, LSL #CAM_EntrySizeLog2
LDR lr, [lr, #CAM_PageFlags]
B %FT30
MOV r5, r4
BL physical_to_ppn ; r5, r7 -> r3
; r9-r11 corrupt
; Manual ppn -> logical so we can get the page flags at the same time
; TODO this won't deal with mapped out pages in a sensible manner (will output them all individually)
[ AMB_LazyMapIn
MOV r9, r0
MOV r0, r3
BL AMB_MakeHonestPN
MOV r0, r9
ADD lr, r6, r3, LSL #CAM_EntrySizeLog2
ASSERT CAM_PageFlags=4
LDMIA lr, {r3, lr}
; Merge in the offset within the page
MOV r3, r3, LSR #12
ORR r4, r3, r4, LSL #20
MOV r4, r4, ROR #20
LDR r3, [sp, #DMAPrepW_InChunk+4]
; Combine the cacheability + phys offset into r5
SUB r5, r5, r4
TST lr, #DynAreaFlags_NotCacheable
ORR r5, r3, r5, LSR #12
ORRNE r5, r5, #DMAPrep_NonCacheable
; Work out how much of r12 fits into this page
; This is done by comparing against the length of the input region,
; since the input could be logical or physical
ADD r6, r4, #4096
MOV r6, r6, LSR #12
RSB r6, r4, r6, LSL #12
CMP r6, r12
MOVHI r6, r12
ADD r6, r4, r6
MOV pc, r1
; In: r0 = flags
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment