libc/memset: Implement function in assembler

Trace analysis of FVP_Base_AEMv8A model running in
Aarch32 mode with the build options listed below:
TRUSTED_BOARD_BOOT=1 GENERATE_COT=1
ARM_ROTPK_LOCATION=devel_ecdsa KEY_ALG=ecdsa
ROT_KEY=plat/arm/board/common/rotpk/arm_rotprivk_ecdsa.pem
shows that when auth_signature() gets called
71.84% of CPU execution time is spent in the memset() function,
which is written in C using single-byte write operations
(see lib/libc/memset.c).
This patch replaces C memset() implementation with assembler
version giving the following results:
- for Aarch32 in auth_signature() call memset() CPU time
reduced to 24.84%.
- Number of CPU instructions executed during TF-A
boot stage before start of BL33 in RELEASE builds:
----------------------------------------------
|  Arch   |     C      |  assembler |    %   |
----------------------------------------------
| Aarch32 | 2073275460 | 1487400003 | -28.25 |
| Aarch64 | 2056807158 | 1244898303 | -39.47 |
----------------------------------------------
The patch also replaces memset.c with aarch64/memset.S
in plat/nvidia/tegra/platform.mk.

Change-Id: Ifbf085a2f577a25491e2d28446ee95a4ac891597
Signed-off-by: Alexei Fedorov <Alexei.Fedorov@arm.com>
This commit is contained in:
Alexei Fedorov 2020-08-16 16:01:13 +01:00
parent e268ea271a
commit e7d344de01
5 changed files with 162 additions and 22 deletions

74
lib/libc/aarch32/memset.S Normal file
View File

@ -0,0 +1,74 @@
/*
* Copyright (c) 2020, Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <asm_macros.S>
.syntax unified
.global memset
/* -----------------------------------------------------------------------
 * void *memset(void *dst, int val, size_t count)
 *
 * Copy the value of 'val' (converted to an unsigned char) into
 * each of the first 'count' characters of the object pointed to by 'dst'.
 *
 * Returns the value of 'dst'.
 *
 * Register usage:
 *   r0         - 'dst'; never modified, so it is also the return value
 *   r1         - 'val'; its low byte is replicated into all four bytes
 *                once the write pointer is 4-byte aligned
 *   r2         - number of bytes still to be written
 *   r3, r4, lr - copies of the replicated value for multi-register stores
 *                (r4 and lr are saved on the stack while used this way)
 *   r12        - write pointer
 * -----------------------------------------------------------------------
 */
func memset
	cmp	r2, #0
	bxeq	lr			/* return if 'count' = 0 */

	mov	r12, r0			/* keep r0 intact for the return value */
	tst	r0, #3
	beq	aligned			/* already 4-byte aligned */

	/* Unaligned 'dst': write single bytes until r12 is 4-byte aligned */
unaligned:
	strb	r1, [r12], #1
	subs	r2, r2, #1
	bxeq	lr			/* return if nothing is left */
	tst	r12, #3
	bne	unaligned		/* continue while unaligned */

	/* r12 is 4-byte aligned and r2 > 0 from here on */
aligned:
	bfi	r1, r1, #8, #8		/* r1[15:0]  = val:val */
	bfi	r1, r1, #16, #16	/* r1[31:0]  = val:val:val:val */
	mov	r3, r1

	cmp	r2, #16
	blo	less_16

	push	{r4, lr}		/* two more registers for 16-byte stores */
	mov	r4, r1
	mov	lr, r1

	cmp	r2, #32
	blo	less_32

write_32:
	stmia	r12!, {r1, r3, r4, lr}	/* write 32 bytes per iteration */
	stmia	r12!, {r1, r3, r4, lr}
	subs	r2, r2, #32
	popeq	{r4, pc}		/* return if nothing is left */
	cmp	r2, #32
	bhs	write_32

	/* r2 is in the range 1..31 here */
less_32:
	cmp	r2, #16
	stmiahs	r12!, {r1, r3, r4, lr}	/* write 16 bytes */
	popeq	{r4, pc}		/* return if exactly 16 were left */
	pop	{r4, lr}

	/*
	 * Only r2[3:0] matters from here on; r2 is not decremented after
	 * the 16-byte store above because bit 4 is never examined again.
	 */
less_16:
	lsls	r2, r2, #29		/* C = r2[3]; N = r2[2]; Z = (r2[2:0] == 0) */
	stmiacs	r12!, {r1, r3}		/* write 8 bytes */
	bxeq	lr			/* return if no 4/2/1-byte tail remains */
	strmi	r1, [r12], #4		/* write 4 bytes */
	lsls	r2, r2, #1		/* N = r2[1] of the original count */
	strhmi	r1, [r12], #2		/* write 2 bytes */
	/*
	 * Shift once more so that N reflects bit 0 of the count alone.
	 * Testing Z after the previous shift would also be true when only
	 * bit 1 was set, which would store one byte beyond the buffer.
	 */
	lsls	r2, r2, #1		/* N = r2[0] of the original count */
	strbmi	r1, [r12]		/* write 1 byte */
	bx	lr
endfunc memset

79
lib/libc/aarch64/memset.S Normal file
View File

@ -0,0 +1,79 @@
/*
* Copyright (c) 2020, Arm Limited. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <asm_macros.S>
.global memset
/* -----------------------------------------------------------------------
 * void *memset(void *dst, int val, size_t count)
 *
 * Copy the value of 'val' (converted to an unsigned char) into
 * each of the first 'count' characters of the object pointed to by 'dst'.
 *
 * Returns the value of 'dst'.
 *
 * Register usage:
 *   x0 - 'dst'; never modified, so it is also the return value
 *   x1 - 'val'; its low byte is replicated into all eight bytes once the
 *        write pointer is 8-byte aligned
 *   x2 - number of bytes still to be written (full 64-bit 'count')
 *   x3 - write pointer
 *   x4 - number of bytes to be written by the 64-byte loop
 * -----------------------------------------------------------------------
 */
func memset
	cbz	x2, exit		/* exit if 'count' = 0 */
	mov	x3, x0			/* keep x0 intact for the return value */
	tst	x0, #7
	b.eq	aligned			/* already 8-byte aligned */

	/* Unaligned 'dst': write single bytes until x3 is 8-byte aligned */
unaligned:
	strb	w1, [x3], #1
	subs	x2, x2, #1
	b.eq	exit			/* exit if nothing is left */
	tst	x3, #7
	b.ne	unaligned		/* continue while unaligned */

	/* x3 is 8-byte aligned and x2 > 0 from here on */
aligned:
	/*
	 * If all 64 bits of x1 are already zero there is nothing to
	 * replicate. The test is on x1 rather than w1 so that any non-zero
	 * upper bits are still overwritten by the bfi sequence below.
	 */
	cbz	x1, x1_zero
	bfi	w1, w1, #8, #8		/* w1[15:0] = val:val */
	bfi	w1, w1, #16, #16	/* w1[31:0] = val repeated 4 times */
	bfi	x1, x1, #32, #32	/* x1[63:0] = val repeated 8 times */

x1_zero:
	ands	x4, x2, #~0x3f		/* x4 = 'count' rounded down to 64 */
	b.eq	less_64

write_64:
	.rept	4
	stp	x1, x1, [x3], #16	/* write 64 bytes per iteration */
	.endr
	subs	x4, x4, #64
	b.ne	write_64
	ands	x2, x2, #0x3f		/* keep only the sub-64-byte remainder */
	b.eq	exit			/* exit if nothing is left */

	/*
	 * x2 is in the range 1..63 here, so the 32-bit view w2 is sufficient
	 * for the remaining bit tests.
	 */
less_64:
	tbz	w2, #5, less_32		/* < 32 bytes */
	stp	x1, x1, [x3], #16	/* write 32 bytes */
	stp	x1, x1, [x3], #16
	ands	w2, w2, #0x1f
	b.eq	exit

less_32:
	tbz	w2, #4, less_16		/* < 16 bytes */
	stp	x1, x1, [x3], #16	/* write 16 bytes */
	ands	w2, w2, #0xf
	b.eq	exit

less_16:
	tbz	w2, #3, less_8		/* < 8 bytes */
	str	x1, [x3], #8		/* write 8 bytes */
	ands	w2, w2, #7
	b.eq	exit

less_8:
	tbz	w2, #2, less_4		/* < 4 bytes */
	str	w1, [x3], #4		/* write 4 bytes */
	ands	w2, w2, #3
	b.eq	exit

	/* w2 is in the range 1..3 here */
less_4:
	tbz	w2, #1, less_2		/* < 2 bytes, i.e. exactly 1 */
	strh	w1, [x3], #2		/* write 2 bytes */
	tbz	w2, #0, exit		/* exactly 2 were left */

less_2:
	strb	w1, [x3]		/* write 1 byte */

exit:
	ret
endfunc memset

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2016-2019, ARM Limited and Contributors. All rights reserved.
# Copyright (c) 2016-2020, ARM Limited and Contributors. All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#
@ -13,7 +13,6 @@ LIBC_SRCS := $(addprefix lib/libc/, \
memcpy.c \
memmove.c \
memrchr.c \
memset.c \
printf.c \
putchar.c \
puts.c \
@ -28,8 +27,14 @@ LIBC_SRCS := $(addprefix lib/libc/, \
# When building for aarch64, append the assembler sources kept under
# lib/libc/aarch64/ (memset.S and setjmp.S) to LIBC_SRCS.
ifeq (${ARCH},aarch64)
LIBC_SRCS += $(addprefix lib/libc/aarch64/, \
memset.S \
setjmp.S)
endif
# When building for aarch32, append the assembler memset kept under
# lib/libc/aarch32/ to LIBC_SRCS.
ifeq (${ARCH},aarch32)
LIBC_SRCS += $(addprefix lib/libc/aarch32/, \
memset.S)
endif
INCLUDES += -Iinclude/lib/libc \
-Iinclude/lib/libc/$(ARCH) \

View File

@ -1,18 +0,0 @@
/*
* Copyright (c) 2013-2019, ARM Limited and Contributors. All rights reserved.
*
* SPDX-License-Identifier: BSD-3-Clause
*/
#include <stddef.h>
#include <string.h>
/*
 * Store 'val' (narrowed to a char) into each of the first 'count' bytes
 * starting at 'dst', one byte at a time, and hand 'dst' back to the caller.
 */
void *memset(void *dst, int val, size_t count)
{
	char *out = dst;
	size_t i;

	for (i = 0; i < count; i++) {
		out[i] = val;
	}

	return dst;
}

View File

@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2019, ARM Limited and Contributors. All rights reserved.
# Copyright (c) 2015-2020, ARM Limited and Contributors. All rights reserved.
# Copyright (c) 2020, NVIDIA Corporation. All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
@ -69,11 +69,11 @@ TF_CFLAGS += -Wsign-compare -nostdlib
# override with necessary libc files for the Tegra platform
override LIBC_SRCS := $(addprefix lib/libc/, \
aarch64/memset.S \
aarch64/setjmp.S \
assert.c \
memcpy.c \
memmove.c \
memset.c \
printf.c \
putchar.c \
strlen.c \