Optimise data cache clean/invalidate operation

The data cache clean and invalidate operations dcsw_op_all() and dcsw_op_louis() were implemented to invoke DSB and ISB barriers for every set/way operation. This adds a substantial performance penalty to an already expensive operation. These functions have been reworked to provide an optimised implementation derived from the code in section D3.4 of the ARMv8 ARM. The helper macro setup_dcsw_op_args has been moved and reworked alongside the implementation. Fixes ARM-software/tf-issues#146 Change-Id: Icd5df57816a83f0a842fce935320a369f7465c7f
2014-04-25 10:49:30 +01:00 · 2014-04-25 10:49:30 +01:00 · 5f6032a820
parent e404d7f44a
commit 5f6032a820
2 changed files with 75 additions and 84 deletions
--- a/include/common/asm_macros.S
+++ b/include/common/asm_macros.S
@ -65,13 +65,6 @@
 	.endm


-	.macro	setup_dcsw_op_args  start_level, end_level, clidr, shift, fw, ls
-	mrs	\clidr, clidr_el1
-	mov	\start_level, xzr
-	ubfx	\end_level, \clidr, \shift, \fw
-	lsl	\end_level, \end_level, \ls
-	.endm
-
 	/*
 	 * This macro verifies that the a given vector doesn't exceed the
 	 * architectural limit of 32 instructions. This is meant to be placed
--- a/lib/aarch64/cache_helpers.S
+++ b/lib/aarch64/cache_helpers.S
@ -138,94 +138,92 @@ inv_loop:
 	ret


-	/* ------------------------------------------
-	 * Data cache operations by set/way to the
-	 * level specified
-	 * ------------------------------------------
-	 * ----------------------------------
-	 * Call this func with the clidr in
-	 * x0, starting cache level in x10,
-	 * last cache level in x3 & cm op in
-	 * x14
-	 * ----------------------------------
+	/* ---------------------------------------------------------------
+	 * Data cache operations by set/way to the level specified
+	 *
+	 * The main function, do_dcsw_op requires:
+	 * x0: The operation type (0-2), as defined in arch.h
+	 * x3: The last cache level to operate on
+	 * x9: clidr_el1
+	 * and will carry out the operation on each data cache from level 0
+	 * to the level in x3 in sequence
+	 *
+	 * The dcsw_op macro sets up the x3 and x9 parameters based on
+	 * clidr_el1 cache information before invoking the main function
+	 * ---------------------------------------------------------------
 	 */
-func dcsw_op
-all_start_at_level:
-	add	x2, x10, x10, lsr #1            // work out 3x current cache level
-	lsr	x1, x0, x2                      // extract cache type bits from clidr
-	and	x1, x1, #7                      // mask of the bits for current cache only
-	cmp	x1, #2                          // see what cache we have at this level
-	b.lt	skip                            // skip if no cache, or just i-cache
-	msr	csselr_el1, x10                 // select current cache level in csselr
-	isb                                     // isb to sych the new cssr&csidr
-	mrs	x1, ccsidr_el1                  // read the new ccsidr
-	and	x2, x1, #7                      // extract the length of the cache lines
-	add	x2, x2, #4                      // add 4 (line length offset)
-	mov	x4, #0x3ff
-	and	x4, x4, x1, lsr #3              // find maximum number on the way size
-	clz	w5, w4                          // find bit position of way size increment
-	mov	x7, #0x7fff
-	and	x7, x7, x1, lsr #13             // extract max number of the index size
-loop2:
-	mov	x9, x4                          // create working copy of max way size
-loop3:
-	lsl	x6, x9, x5
-	orr	x11, x10, x6                    // factor way and cache number into x11
-	lsl	x6, x7, x2
-	orr	x11, x11, x6                    // factor index number into x11
-	mov	x12, x0
-	mov	x13, x30 // lr
-	mov	x0, x11
-	blr	x14
-	mov	x0, x12
-	mov	x30, x13 // lr
-	subs	x9, x9, #1                      // decrement the way
-	b.ge    loop3
-	subs	x7, x7, #1                      // decrement the index
-	b.ge    loop2
-skip:
-	add	x10, x10, #2                    // increment cache number
-	cmp	x3, x10
-	b.gt    all_start_at_level
-finished:
-	mov	x10, #0                         // swith back to cache level 0
-	msr	csselr_el1, x10                 // select current cache level in csselr
-	dsb	sy
-	isb
-	ret

+	.macro	dcsw_op shift, fw, ls
+	mrs	x9, clidr_el1
+	ubfx	x3, x9, \shift, \fw
+	lsl	x3, x3, \ls
+	b	do_dcsw_op
+	.endm

 func do_dcsw_op
 	cbz	x3, exit
-	cmp	x0, #DCISW
-	b.eq	dc_isw
-	cmp	x0, #DCCISW
-	b.eq	dc_cisw
-	cmp	x0, #DCCSW
-	b.eq	dc_csw
-dc_isw:
+	mov	x10, xzr
+	adr	x14, dcsw_loop_table	// compute inner loop address
+	add	x14, x14, x0, lsl #5	// inner loop is 8x32-bit instructions
 	mov	x0, x9
-	adr	x14, dcisw
-	b	dcsw_op
-dc_cisw:
-	mov	x0, x9
-	adr	x14, dccisw
-	b	dcsw_op
-dc_csw:
-	mov	x0, x9
-	adr	x14, dccsw
-	b	dcsw_op
+	mov	w8, #1
+loop1:
+	add	x2, x10, x10, lsr #1	// work out 3x current cache level
+	lsr	x1, x0, x2		// extract cache type bits from clidr
+	and	x1, x1, #7		// mask the bits for current cache only
+	cmp	x1, #2			// see what cache we have at this level
+	b.lt	level_done		// nothing to do if no cache or icache
+
+	msr	csselr_el1, x10		// select current cache level in csselr
+	isb				// isb to sych the new cssr&csidr
+	mrs	x1, ccsidr_el1		// read the new ccsidr
+	and	x2, x1, #7		// extract the length of the cache lines
+	add	x2, x2, #4		// add 4 (line length offset)
+	ubfx	x4, x1, #3, #10		// maximum way number
+	clz	w5, w4			// bit position of way size increment
+	lsl	w9, w4, w5		// w9 = aligned max way number
+	lsl	w16, w8, w5		// w16 = way number loop decrement
+	orr	w9, w10, w9		// w9 = combine way and cache number
+	ubfx	w6, w1, #13, #15	// w6 = max set number
+	lsl	w17, w8, w2		// w17 = set number loop decrement
+	dsb	sy			// barrier before we start this level
+	br	x14			// jump to DC operation specific loop
+
+	.macro	dcsw_loop _op
+loop2_\_op:
+	lsl	w7, w6, w2		// w7 = aligned max set number
+
+loop3_\_op:
+	orr	w11, w9, w7		// combine cache, way and set number
+	dc	\_op, x11
+	subs	w7, w7, w17		// decrement set number
+	b.ge	loop3_\_op
+
+	subs	x9, x9, x16		// decrement way number
+	b.ge	loop2_\_op
+
+	b	level_done
+	.endm
+
+level_done:
+	add	x10, x10, #2		// increment cache number
+	cmp	x3, x10
+	b.gt    loop1
+	msr	csselr_el1, xzr		// select cache level 0 in csselr
+	dsb	sy			// barrier to complete final cache operation
+	isb
 exit:
 	ret

+dcsw_loop_table:
+	dcsw_loop isw
+	dcsw_loop cisw
+	dcsw_loop csw
+

 func dcsw_op_louis
-	dsb	sy
-	setup_dcsw_op_args x10, x3, x9, #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
-	b	do_dcsw_op
+	dcsw_op #LOUIS_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT


 func dcsw_op_all
-	dsb	sy
-	setup_dcsw_op_args x10, x3, x9, #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT
-	b	do_dcsw_op
+	dcsw_op #LOC_SHIFT, #CLIDR_FIELD_WIDTH, #LEVEL_SHIFT