Commit 2aaf1abc authored by Ben Avison

Partial rewrite of SDHOST driver to use DMA for data transfer

This restores microSD performance to that seen with the SDHCI controller.

Benchmarks on Pi Zero W (ARM1176JZF-S):
  SDHCI:      23.8 MB/s
  SDHOST PIO:  1.9 MB/s
  SDHOST DMA: 24.4 MB/s

Benchmarks on Pi Zero 2W (Cortex-A53):
  SDHCI:      23.8 MB/s
  SDHOST PIO:  8.6 MB/s
  SDHOST DMA: 24.4 MB/s
parent 11f19bb8
Makefile
@@ -17,7 +17,7 @@
COMPONENT = BCM2835 HAL
TARGET = BCM2835
-OBJS = Top CLib CMOS Debug Interrupts SDIO Timers UART USB Video DMA Messaging GPIO VCHIQ IIC RTC SPI Touch KbdScan DBell IntVC6 PCI EtherNIC
+OBJS = Top CLib CMOS Debug Interrupts SDIO Timers UART USB Video DMA Messaging GPIO VCHIQ IIC RTC SPI Touch KbdScan DBell IntVC6 PCI EtherNIC cachemaint memcpy
include HAL
@@ -159,6 +159,11 @@ SDHCIWriteInterval # 4 ; minimum counter ticks between writes
SDHCILastWriteCount # 4 ; counter value at last write
SDHCIInputClock # 4 ; estimated speed of input clock to SDHCI block, kHz
SDHOSTClock # 4 ; SDHOST clock speed, kHz
+SDHOSTDMABase # 4 ; base address of DMA channel reserved for SDHOST use
+SDHOSTDMACBLog # 4 ; base address (logical) of DMA descriptors used for SDHOST
+SDHOSTDMACBPhy # 4 ; base address (VPU physical) of DMA descriptors used for SDHOST
+CleanRange # 4 ; -> CleanRange_ARM1176JZFS or CleanRange_CortexA53
+InvalidateRange # 4 ; -> InvalidateRange_ARM1176JZFS or InvalidateRange_CortexA53
SDHOSTCRCError # 1 ; data CRC status bits to test
# (16-:INDEX:@):AND:15 ; align nicely
SDHOSTSpinLock # 8 ; protects SDHCFG register
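The two new function pointers keep the SDHOST driver CPU-agnostic: the HAL selects the ARM1176JZF-S or Cortex-A53 routine once at initialisation, and the driver dispatches through the workspace. A minimal sketch of such a call (hypothetical call site; assumes sb holds the HAL workspace pointer, as elsewhere in this HAL, with a1/a2 delimiting the buffer):

    LDR     ip, [sb, #CleanRange]   ; routine selected for this CPU at init
    BLX     ip                      ; clean [a1, a2) before the DMA read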
This diff is collapsed.
@@ -1166,7 +1166,7 @@ HAL_Null
HAL_InitDevices
Push "lr"
BL Video_InitDevices ; Must be before DMA_InitDevices
-BL SDIO_InitDevices
+BL SDIO_InitDevices ; Must be before DMA_InitDevices
BL DMA_InitDevices
BL GPIO_InitDevices
BL VCHIQ_InitDevices
s/cachemaint (new file)
;
; Copyright (c) 2021, RISC OS Developments Ltd
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
; * Neither the name of RISC OS Open Ltd nor the names of its contributors
; may be used to endorse or promote products derived from this software
; without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
; POSSIBILITY OF SUCH DAMAGE.
;
EXPORT CleanRange_ARM1176JZFS
EXPORT InvalidateRange_ARM1176JZFS
EXPORT CleanRange_CortexA53
EXPORT InvalidateRange_CortexA53
AREA |Asm$$Code|, CODE, READONLY
; Do an L1 data cache clean
; (Note this implementation assumes an ARM1176JZF-S and may not be applicable to other CPUs)
; On entry:
; a1 -> first address to clean (inclusive)
; a2 -> last address to clean (exclusive)
CleanRange_ARM1176JZFS ROUT
; Round outwards to next cacheline boundary
ADD a2, a2, #31
BIC a1, a1, #31
BIC a2, a2, #31
; Clean cachelines by MVA (logical address)
01 MCR p15, 0, a1, c7, c10, 1
ADD a1, a1, #32
CMP a1, a2
BNE %BT01
; Now ensure cache maintenance operations before this point complete
; before any memory accesses by any observer (most importantly, the DMA
; controller) triggered by any instruction occurring after this point in
; program order
MCR p15, 0, a1, c7, c10, 4 ; CP15_DSB, aka drain write buffer
BX lr
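;
; Illustrative only (not part of the original file): a hypothetical
; write-path caller, cleaning a buffer before a memory-to-card DMA:
;   LDR     a1, =buffer              ; hypothetical buffer symbol
;   ADD     a2, a1, #512             ; one block's worth
;   BL      CleanRange_ARM1176JZFS   ; push dirty lines out to RAM
;   ; ...only now program the DMA control block and start the channel
; The DSB above is what guarantees the cleans have reached memory before
; the store that activates the DMA channel.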
; Do an L1 data cache invalidate
; (Note this implementation assumes an ARM1176JZF-S and may not be applicable to other CPUs)
; On entry:
; a1 -> first address to invalidate (inclusive)
; a2 -> last address to invalidate (exclusive)
InvalidateRange_ARM1176JZFS ROUT
; Round outwards to next cacheline boundary
ADD a2, a2, #31
BIC a1, a1, #31
BIC a2, a2, #31
; Invalidate cachelines by MVA (logical address)
01 MCR p15, 0, a1, c7, c6, 1
ADD a1, a1, #32
CMP a1, a2
BNE %BT01
; The RISC OS kernel does a DSB here too, although I have been unable to
; locate official guidance recommending it. You'd think that in a
; situation where a pure invalidate (as opposed to clean & invalidate)
; was appropriate, no data targeted at the range in question should have
; been in the write buffer anyway...
MCR p15, 0, a1, c7, c10, 4 ; CP15_DSB, aka drain write buffer
BX lr
; Do a data cache clean to point of coherency of a range of addresses
; (Note this implementation assumes a Cortex-A53 and may not be applicable to other CPUs)
; On entry:
; a1 -> first address to clean (inclusive)
; a2 -> last address to clean (exclusive)
CleanRange_CortexA53 ROUT
; Round outwards to next cacheline boundary
ADD a2, a2, #63
BIC a1, a1, #63
BIC a2, a2, #63
; Clean cachelines by MVA (logical address)
01 MCR p15, 0, a1, c7, c10, 1 ; DCCMVAC
ADD a1, a1, #64
CMP a1, a2
BNE %BT01
; Now ensure cache maintenance operations before this point complete
; before any memory accesses by any observer (most importantly, the DMA
; controller) triggered by any instruction occurring after this point in
; program order
DSB
BX lr
; Do a data cache invalidate to point of coherency of a range of addresses
; (Note this implementation assumes a Cortex-A53 and may not be applicable to other CPUs)
; On entry:
; a1 -> first address to invalidate (inclusive)
; a2 -> last address to invalidate (exclusive)
InvalidateRange_CortexA53 ROUT
; Round outwards to next cacheline boundary
ADD a2, a2, #63
BIC a1, a1, #63
BIC a2, a2, #63
; Ensure that cache invalidation isn't observed until after reading the
; flag that indicated it was time to invalidate
DMB
; Invalidate cachelines by MVA (logical address)
01 MCR p15, 0, a1, c7, c6, 1 ; DCIMVAC
ADD a1, a1, #64
CMP a1, a2
BNE %BT01
; Examples in ARMv8 ARM K11.5.1 imply no need for a barrier afterwards
BX lr
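; Illustrative only (not part of the original file): the DMB above matters
; in a hypothetical completion-polling caller such as:
;   10 LDR    a4, [sb, #TransferDone]   ; hypothetical "DMA done" flag
;      TEQ    a4, #0
;      BEQ    %BT10
;      BL     InvalidateRange_CortexA53 ; discard any stale lines
; Without the DMB, the invalidates could be observed before the load of
; the flag, so a line could be refilled with pre-DMA data and survive.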
END
s/memcpy (new file)
;
; Copyright (c) 2013, Raspberry Pi Foundation
; Copyright (c) 2013, RISC OS Open Ltd
; Copyright (c) 2021, RISC OS Developments Ltd
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
; * Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
; * Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
; * Neither the name of RISC OS Open Ltd nor the names of its contributors
; may be used to endorse or promote products derived from this software
; without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
; LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
; CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
; ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
; POSSIBILITY OF SUCH DAMAGE.
;
GET ListOpts
GET Macros
GET System
EXPORT FastMemcpy
AREA |Asm$$Code|, CODE, READONLY
ARM
MACRO
$offset Copy1Word$cond $r0, $r1
[ $offset=0
LDR$cond $r0, [a2], #4
|
LSR$cond $r0, $r1, #$offset*8
LDR$cond $r1, [a2, #4]!
ORR$cond $r0, $r1, LSL #32-$offset*8
]
STR$cond $r0, [a1], #4
MEND
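;
; Worked example (little-endian, $offset=1): if the previous aligned load
; left $r1 = 0xDDCCBBAA and the next word is 0x44332211, the LSR/ORR pair
; yields 0x11DDCCBB - memory bytes BB CC DD 11, i.e. the four bytes
; starting one byte into the old word.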
MACRO
$offset Copy2Words$cond $r0, $r1, $r2
ASSERT $r0<$r1
[ $offset=0
LDM$cond a2!, {$r0,$r1}
|
LDR$cond $r1, [a2, #4]!
LSR$cond $r0, $r2, #$offset*8
LDR$cond $r2, [a2, #4]!
ORR$cond $r0, $r1, LSL #32-$offset*8
LSR$cond $r1, #$offset*8
ORR$cond $r1, $r2, LSL #32-$offset*8
]
STM$cond a1!, {$r0,$r1}
MEND
MACRO
$offset Copy4Words $r0, $r1, $r2, $r3, $r4
ASSERT $r0<$r1
ASSERT $r1<$r2
ASSERT $r2<$r3
ASSERT $r3<$r4
[ $offset=0
LDM a2!, {$r0,$r1,$r2,$r3}
|
LDMIB a2!, {$r1,$r2}
LSR $r0, $r4, #$offset*8
LDMIB a2!, {$r3,$r4}
ORR $r0, $r1, LSL #32-$offset*8
LSR $r1, #$offset*8
ORR $r1, $r2, LSL #32-$offset*8
LSR $r2, #$offset*8
ORR $r2, $r3, LSL #32-$offset*8
LSR $r3, #$offset*8
ORR $r3, $r4, LSL #32-$offset*8
]
STM a1!, {$r0,$r1,$r2,$r3}
MEND
MACRO
$offset Copy8Words $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $pld_dist
ASSERT $r0<$r1
ASSERT $r1<$r2
ASSERT $r2<$r3
ASSERT $r3<$r4
ASSERT $r4<$r5
ASSERT $r5<$r6
ASSERT $r6<$r7
ASSERT $r7<$r8
[ $offset=0
LDM a2!, {$r0,$r1,$r2,$r3,$r4,$r5,$r6,$r7}
[ "$pld_dist"<>""
PLD [a2, $pld_dist]
]
|
LDMIB a2!, {$r1,$r2,$r3,$r4}
LSR $r0, $r8, #$offset*8
LDMIB a2!, {$r5,$r6,$r7,$r8}
[ "$pld_dist"<>""
PLD [a2, $pld_dist]
]
ORR $r0, $r1, LSL #32-$offset*8
LSR $r1, #$offset*8
ORR $r1, $r2, LSL #32-$offset*8
LSR $r2, #$offset*8
ORR $r2, $r3, LSL #32-$offset*8
LSR $r3, #$offset*8
ORR $r3, $r4, LSL #32-$offset*8
LSR $r4, #$offset*8
ORR $r4, $r5, LSL #32-$offset*8
LSR $r5, #$offset*8
ORR $r5, $r6, LSL #32-$offset*8
LSR $r6, #$offset*8
ORR $r6, $r7, LSL #32-$offset*8
LSR $r7, #$offset*8
ORR $r7, $r8, LSL #32-$offset*8
]
STM a1!, {$r0,$r1,$r2,$r3}
STM a1!, {$r4,$r5,$r6,$r7}
MEND
MACRO
$offset Leading15Bytes
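; ip holds the number of leading bytes (0-15); the LSLS instructions move
; its bits into flags: after LSLS #31, N = bit 0 (copy 1 byte) and C =
; bit 1 (copy 2 bytes); after LSLS #29, N = bit 2 (copy 1 word) and C =
; bit 3 (copy 2 words).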
LSLS a4, ip, #31
SUB a3, ip
LDRBMI a4, [a2], #1
[ $offset:AND:1 = 0
LDRHCS v1, [a2], #2
|
LDRBCS v1, [a2], #1
LDRBCS v2, [a2], #1
ORRCS v1, v2, LSL #8
]
STRBMI a4, [a1], #1
STRHCS v1, [a1], #2
[ $offset>0
LDR lr, [a2, #-$offset]!
]
LSLS a4, ip, #29
$offset Copy1WordMI a4, lr
$offset Copy2WordsCS a4, v1, lr
MEND
MACRO
$offset Trailing15Bytes
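; Mirror of the Leading15Bytes flag trick, consuming the residual count
; in a3 from the top down: bits 3 and 2 (two words, then one word) first,
; then bits 1 and 0 (halfword, then byte).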
LSLS a3, #29
$offset Copy2WordsCS a4, v1, lr
$offset Copy1WordMI a4, lr
[ $offset>0
ADD a2, #$offset
]
LSLS a3, #2
[ $offset:AND:1 = 0
LDRHCS a4, [a2], #2
|
LDRBCS a4, [a2], #1
LDRBCS v1, [a2], #1
ORRCS a4, v1, LSL #8
]
LDRBMI v1, [a2]
STRHCS a4, [a1], #2
STRBMI v1, [a1]
MEND
MACRO
$offset Large
$offset
$offset Leading15Bytes
AND ip, a2, #31 ; distance back to start of first (or only) cacheline loaded for this 32-byte block
RSB ip, #4*32 ; offset by which to PLD after we've LDMed each block
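; Worked example (illustrative): if a2 enters the loop 8 bytes into a
; cacheline, ip = 128-8 = 120; once each 32-byte LDM has advanced a2,
; PLD [a2, ip] lands on byte 0 of a line 4 cachelines ahead of the new
; read position, matching the preload strategy noted above FastMemcpy.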
10
$offset Copy8Words a4, v1, v2, v3, v4, v5, v6, sl, lr, ip
SUBS a3, #32
BHS %BT10
; There are three possibilities for the remainder of the copy, depending
; on the sum of the present offset of a2 from its cacheline base and the
; number of trailing <= 31 bytes: either 0, 1 or 2 further preloads
AND a4, a2, #31
AND v1, a3, #31
ADD ip, #32
ADDS a4, v1
BEQ %FT20
PLD [a2, ip]
ADD ip, #32
CMP a4, #32
BLS %FT20
PLD [a2, ip]
20 ; Readjust loop termination condition to stop when less than 32 bytes left
ADD a3, #4*32
30
$offset Copy8Words a4, v1, v2, v3, v4, v5, v6, sl, lr
SUBS a3, #32
BHS %BT30
; Deal with trailing <= 31 bytes
LSLS a4, a3, #31-4
BEQ done_largeframe ; quick exit for common case of no trailing bytes
BPL %FT40
$offset Copy4Words a4, v1, v2, v3, lr
40
$offset Trailing15Bytes
POP {a1,v1-v6,sl,pc}
MEND
MACRO
$offset Small
$offset
$offset Leading15Bytes
10
$offset Copy4Words a4, v1, v2, ip, lr
SUBS a3, #16
BHS %BT10
TST a3, #15 ; unaffected by initial adjustment by 16
BEQ done_smallframe
$offset Trailing15Bytes
POP {a1,v1-v2,pc}
MEND
MACRO
$offset Tiny
$offset
[ $offset>0
LDR lr, [a2, #-$offset]!
]
TST a3, #16
BEQ %FT10
$offset Copy4Words a4, v1, v2, ip, lr
10
$offset Trailing15Bytes
POP {a1,v1-v2,pc}
MEND
; A memcpy stand-in that's reasonably well optimised for ARM1176JZF-S:
; * Writes should ideally be 16-byte-aligned 16-byte blocks.
; * Reads benefit from a PLD once every 32-byte cacheline, targeted within the
; first 8 bytes of a 32-byte cacheline, 4 cachelines ahead of the current
; read position.
; On entry:
; a1 -> destination
; a2 -> source
; a3 = number of bytes
; On exit:
; a1 preserved
; a2-a4, ip corrupted
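; A minimal call might look like this (hypothetical caller; the symbols
; are placeholders):
;   LDR     a1, =dst_buffer
;   LDR     a2, =src_buffer
;   MOV     a3, #512
;   BL      FastMemcpy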
FastMemcpy ROUT
; Are we so small we (probably) can't even do a 16-byte aligned write?
CMP a3, #31
BLO do_tiny
; To enable preload-as-we-go, we require that the source buffer covers
; at least 5 consecutive complete cachelines (so we preload cacheline #5
; at the same time as loading cacheline #1). Lengths of 1 byte short of
; 6 cachelines are the shortest that are guaranteed to meet this
; requirement irrespective of alignment.
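; Worked example: a 6*32-1 = 191 byte copy whose source starts on the
; last byte of a cacheline covers offsets 31..221 - five complete
; cachelines (bytes 32..191) plus two partial ones.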
CMP a3, #6*32-1
BLO do_small
; Handle large copies
PUSH {a1,v1-v6,sl,lr}
SUB lr, a2, a1 ; find relative alignment
SUB a3, #32+4*32 ; simplify first loop termination test
; (we want it to stop when there are
; still 4 lots of 32 bytes to go)
BIC a4, a2, #31 ; -> base of first cacheline
; At least 4 cachelines must be preloaded upfront
GBLA o
o SETA 0
WHILE o < 4
PLD [a4, #o*32]
o SETA o + 1
WEND
; If the leading up-to-15 bytes (to acquire destination alignment)
; will push the source pointer into a new cacheline, one more is needed
ANDS ip, a1, #15
RSBNE ip, #16 ; number of leading bytes
ADD v1, a2, ip
AND v1, #31 ; index into cacheline when leading bytes done
CMP ip, v1 ; preload needed if leading bytes exceed that
BLS %F10
PLD [a4, #4*32]
10 LSLS lr, #31
BHI %FA3
BCS %FA2
BNE %FA1
0 Large
1 Large
2 Large
3 Large
done_largeframe
POP {a1,v1-v6,sl,pc}
do_small ; Handle copies that write half-cachelines without preload
PUSH {a1,v1-v2,lr}
SUB lr, a2, a1 ; find relative alignment
SUB a3, a3, #16 ; simplify loop termination test
ANDS ip, a1, #15
RSBNE ip, #16 ; number of leading bytes
LSLS lr, #31
BHI %FA3
BCS %FA2
BNE %FA1
0 Small
1 Small
2 Small
3 Small
do_tiny ; Handle copies so short that we can't write in half-cachelines
PUSH {a1,v1-v2,lr}
TST a1, #3
BEQ %F20
10 SUBS a3, a3, #1
BLO done_smallframe
LDRB a4, [a2], #1
STRB a4, [a1], #1
TST a1, #3
BNE %B10
20 TEQ a3, #0
BEQ done_smallframe
; Destination now at word alignment
LSLS lr, a2, #31
BHI %FA3
BCS %FA2
BNE %FA1
0 Tiny
1 Tiny
2 Tiny
3 Tiny
done_smallframe
POP {a1,v1-v2,pc}
END