Optimise data cache clean/invalidate operation

The data cache clean and invalidate operations dcsw_op_all()
and dcsw_op_loius() were implemented to invoke a DSB and ISB
barrier for every set/way operation. This adds a substantial
performance penalty to an already expensive operation.

These functions have been reworked to provide an optimised
implementation derived from the code in section D3.4 of the
ARMv8 ARM. The helper macro setup_dcsw_op_args has been moved
and reworked alongside the implementation.

Fixes ARM-software/tf-issues#146

Change-Id: Icd5df57816a83f0a842fce935320a369f7465c7f
This commit is contained in:
Andrew Thoelke 2014-04-25 10:49:30 +01:00
parent e404d7f44a
commit 5f6032a820
2 changed files with 75 additions and 84 deletions

View File

@ -65,13 +65,6 @@
.endm
.macro setup_dcsw_op_args start_level, end_level, clidr, shift, fw, ls
mrs \clidr, clidr_el1
mov \start_level, xzr
ubfx \end_level, \clidr, \shift, \fw
lsl \end_level, \end_level, \ls
.endm
/*
* This macro verifies that the a given vector doesn't exceed the
* architectural limit of 32 instructions. This is meant to be placed

View File

@ -138,94 +138,92 @@ inv_loop:
ret
/* ------------------------------------------
* Data cache operations by set/way to the
* level specified
* ------------------------------------------
* ----------------------------------
* Call this func with the clidr in
* x0, starting cache level in x10,
* last cache level in x3 & cm op in
* x14
* ----------------------------------
/* ---------------------------------------------------------------
* Data cache operations by set/way to the level specified
*
* The main function, do_dcsw_op requires:
* x0: The operation type (0-2), as defined in arch.h
* x3: The last cache level to operate on
* x9: clidr_el1
* and will carry out the operation on each data cache from level 0
* to the level in x3 in sequence
*
* The dcsw_op macro sets up the x3 and x9 parameters based on
* clidr_el1 cache information before invoking the main function
* ---------------------------------------------------------------
*/
func dcsw_op
all_start_at_level:
add x2, x10, x10, lsr #1 // work out 3x current cache level
lsr x1, x0, x2 // extract cache type bits from clidr
and x1, x1, #7 // mask of the bits for current cache only
cmp x1, #2 // see what cache we have at this level
b.lt skip // skip if no cache, or just i-cache
msr csselr_el1, x10 // select current cache level in csselr
isb // isb to sych the new cssr&csidr
mrs x1, ccsidr_el1 // read the new ccsidr
and x2, x1, #7 // extract the length of the cache lines
add x2, x2, #4 // add 4 (line length offset)
mov x4, #0x3ff
and x4, x4, x1, lsr #3 // find maximum number on the way size
clz w5, w4 // find bit position of way size increment
mov x7, #0x7fff
and x7, x7, x1, lsr #13 // extract max number of the index size
loop2:
mov x9, x4 // create working copy of max way size
loop3:
lsl x6, x9, x5
orr x11, x10, x6 // factor way and cache number into x11
lsl x6, x7, x2
orr x11, x11, x6 // factor index number into x11
mov x12, x0
mov x13, x30 // lr
mov x0, x11
blr x14
mov x0, x12
mov x30, x13 // lr
subs x9, x9, #1 // decrement the way
b.ge loop3
subs x7, x7, #1 // decrement the index
b.ge loop2
skip:
add x10, x10, #2 // increment cache number
cmp x3, x10
b.gt all_start_at_level
finished:
mov x10, #0 // swith back to cache level 0
msr csselr_el1, x10 // select current cache level in csselr
dsb sy
isb
ret
.macro dcsw_op shift, fw, ls
mrs x9, clidr_el1
ubfx x3, x9, \shift, \fw
lsl x3, x3, \ls
b do_dcsw_op
.endm
func do_dcsw_op
cbz x3, exit
cmp x0, #DCISW
b.eq dc_isw
cmp x0, #DCCISW
b.eq dc_cisw
cmp x0, #DCCSW
b.eq dc_csw
dc_isw:
mov x10, xzr
adr x14, dcsw_loop_table // compute inner loop address
add x14, x14, x0, lsl #5 // inner loop is 8x32-bit instructions
mov x0, x9
adr x14, dcisw
b dcsw_op
dc_cisw:
mov x0, x9
adr x14, dccisw
b dcsw_op
dc_csw:
mov x0, x9
adr x14, dccsw
b dcsw_op
mov w8, #1
loop1:
add x2, x10, x10, lsr #1 // work out 3x current cache level
lsr x1, x0, x2 // extract cache type bits from clidr
and x1, x1, #7 // mask the bits for current cache only
cmp x1, #2 // see what cache we have at this level
b.lt level_done // nothing to do if no cache or icache
msr csselr_el1, x10 // select current cache level in csselr
isb // isb to sych the new cssr&csidr
mrs x1, ccsidr_el1 // read the new ccsidr
and x2, x1, #7 // extract the length of the cache lines
add x2, x2, #4 // add 4 (line length offset)
ubfx x4, x1, #3, #10 // maximum way number
clz w5, w4 // bit position of way size increment
lsl w9, w4, w5 // w9 = aligned max way number
lsl w16, w8, w5 // w16 = way number loop decrement
orr w9, w10, w9 // w9 = combine way and cache number
ubfx w6, w1, #13, #15 // w6 = max set number
lsl w17, w8, w2 // w17 = set number loop decrement
dsb sy // barrier before we start this level
br x14 // jump to DC operation specific loop
.macro dcsw_loop _op
loop2_\_op:
lsl w7, w6, w2 // w7 = aligned max set number
loop3_\_op:
orr w11, w9, w7 // combine cache, way and set number
dc \_op, x11
subs w7, w7, w17 // decrement set number
b.ge loop3_\_op
subs x9, x9, x16 // decrement way number
b.ge loop2_\_op
b level_done
.endm
level_done:
add x10, x10, #2 // increment cache number
cmp x3, x10
b.gt loop1
msr csselr_el1, xzr // select cache level 0 in csselr
dsb sy // barrier to complete final cache operation
isb
exit:
ret
dcsw_loop_table:
dcsw_loop isw
dcsw_loop cisw
dcsw_loop csw
func dcsw_op_louis
dsb sy
setup_dcsw_op_args x10, x3, x9, #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
b do_dcsw_op
dcsw_op #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
func dcsw_op_all
dsb sy
setup_dcsw_op_args x10, x3, x9, #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
b do_dcsw_op
dcsw_op #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT