From e7d344de01ad11b856233634717aafe9312697e4 Mon Sep 17 00:00:00 2001
From: Alexei Fedorov
Date: Sun, 16 Aug 2020 16:01:13 +0100
Subject: [PATCH] libc/memset: Implement function in assembler

Trace analysis of the FVP_Base_AEMv8A model running in AArch32 mode
with the build options listed below:
TRUSTED_BOARD_BOOT=1 GENERATE_COT=1
ARM_ROTPK_LOCATION=devel_ecdsa KEY_ALG=ecdsa
ROT_KEY=plat/arm/board/common/rotpk/arm_rotprivk_ecdsa.pem
shows that when auth_signature() gets called, 71.84% of CPU
execution time is spent in the memset() function, which is written
in C using single-byte write operations (see lib/libc/memset.c).

This patch replaces the C memset() implementation with an assembler
version, giving the following results:
- For AArch32, in the auth_signature() call, memset() CPU time is
  reduced to 24.84%.
- Number of CPU instructions executed during the TF-A boot stage
  before the start of BL33 in RELEASE builds:
  ----------------------------------------------
  |  Arch   |     C      | assembler  |   %    |
  ----------------------------------------------
  | AArch32 | 2073275460 | 1487400003 | -28.25 |
  | AArch64 | 2056807158 | 1244898303 | -39.47 |
  ----------------------------------------------
The patch also replaces memset.c with aarch64/memset.S in
plat/nvidia/tegra/platform.mk.

Change-Id: Ifbf085a2f577a25491e2d28446ee95a4ac891597
Signed-off-by: Alexei Fedorov
---
 lib/libc/aarch32/memset.S     | 74 ++++++++++++++++++++++++++++++++
 lib/libc/aarch64/memset.S     | 79 +++++++++++++++++++++++++++++++++++
 lib/libc/libc.mk              |  9 +++-
 lib/libc/memset.c             | 18 --------
 plat/nvidia/tegra/platform.mk |  4 +-
 5 files changed, 162 insertions(+), 22 deletions(-)
 create mode 100644 lib/libc/aarch32/memset.S
 create mode 100644 lib/libc/aarch64/memset.S
 delete mode 100644 lib/libc/memset.c

diff --git a/lib/libc/aarch32/memset.S b/lib/libc/aarch32/memset.S
new file mode 100644
index 000000000..0b69897fd
--- /dev/null
+++ b/lib/libc/aarch32/memset.S
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020, Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <asm_macros.S>
+
+	.syntax unified
+	.global	memset
+
+/* -----------------------------------------------------------------------
+ * void *memset(void *dst, int val, size_t count)
+ *
+ * Copy the value of 'val' (converted to an unsigned char) into
+ * each of the first 'count' characters of the object pointed to by 'dst'.
+ * In: r0 = 'dst', r1 = 'val', r2 = 'count'. Clobbers r1-r3, r12, flags.
+ * Returns the value of 'dst'.
+ * -----------------------------------------------------------------------
+ */
+func memset
+	cmp	r2, #0
+	bxeq	lr			/* return if 'count' = 0 */
+	mov	r12, r0			/* r12 = write pointer, keep r0 */
+	tst	r0, #3
+	beq	aligned			/* 4-bytes aligned */
+
+	/* Unaligned 'dst': write single bytes until 4-bytes aligned */
+unaligned:
+	strb	r1, [r12], #1
+	subs	r2, r2, #1
+	bxeq	lr			/* return if 0 */
+	tst	r12, #3
+	bne	unaligned		/* continue while unaligned */
+
+	/* 4-bytes aligned */
+aligned:bfi	r1, r1, #8, #8		/* propagate 'val' to all 4 bytes */
+	bfi	r1, r1, #16, #16
+
+	mov	r3, r1			/* 2nd data register */
+
+	cmp	r2, #16
+	blo	less_16
+
+	push	{r4, lr}		/* r4, lr: 3rd and 4th data registers */
+	mov	r4, r1
+	mov	lr, r1
+
+	cmp	r2, #32
+	blo	less_32
+
+write_32:
+	stmia	r12!, {r1, r3, r4, lr}	/* write 32 bytes in a loop */
+	stmia	r12!, {r1, r3, r4, lr}
+	subs	r2, r2, #32
+	popeq	{r4, pc}		/* return if 0 */
+	cmp	r2, #32
+	bhs	write_32
+
+less_32:cmp	r2, #16
+	stmiahs	r12!, {r1, r3, r4, lr}	/* write 16 bytes */
+	popeq	{r4, pc}		/* return if 16 */
+	pop	{r4, lr}
+
+less_16:lsls	r2, r2, #29		/* C = r2[3]; N = r2[2]; Z = !r2[2:0] */
+	stmiacs	r12!, {r1, r3}		/* write 8 bytes */
+	bxeq	lr			/* return if bits [2:0] are 0 */
+	strmi	r1, [r12], #4		/* write 4 bytes */
+	lsls	r2, r2, #2		/* C = count[1]; Z = !count[0] */
+	strhcs	r1, [r12], #2		/* write 2 bytes */
+	strbne	r1, [r12]		/* write 1 byte, only if count[0] set */
+	bx	lr
+
+endfunc memset
diff --git a/lib/libc/aarch64/memset.S b/lib/libc/aarch64/memset.S
new file mode 100644
index 000000000..8c65760ce
--- /dev/null
+++ b/lib/libc/aarch64/memset.S
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020, Arm Limited. All rights reserved. 
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <asm_macros.S>
+
+	.global	memset
+
+/* -----------------------------------------------------------------------
+ * void *memset(void *dst, int val, size_t count)
+ *
+ * Copy the value of 'val' (converted to an unsigned char) into
+ * each of the first 'count' characters of the object pointed to by 'dst'.
+ * In: x0 = 'dst', w1 = 'val', x2 = 'count'. Clobbers x1-x4 and flags.
+ * Returns the value of 'dst'.
+ * -----------------------------------------------------------------------
+ */
+func memset
+	cbz	x2, exit		/* exit if 'count' = 0 (all 64 bits) */
+	mov	x3, x0			/* x3 = write pointer, keep x0 */
+	tst	x0, #7
+	b.eq	aligned			/* 8-bytes aligned */
+
+	/* Unaligned 'dst': write single bytes until 8-bytes aligned */
+unaligned:
+	strb	w1, [x3], #1
+	subs	x2, x2, #1
+	b.eq	exit			/* exit if 0 */
+	tst	x3, #7
+	b.ne	unaligned		/* continue while unaligned */
+
+	/* 8-bytes aligned */
+aligned:cbz	x1, x1_zero		/* nothing to propagate if zero */
+	bfi	w1, w1, #8, #8		/* propagate 'val' to all 8 bytes */
+	bfi	w1, w1, #16, #16
+	bfi	x1, x1, #32, #32
+
+x1_zero:ands	x4, x2, #~0x3f		/* x4 = bytes to write in 64-byte blocks */
+	b.eq	less_64
+
+write_64:
+	.rept	4
+	stp	x1, x1, [x3], #16	/* write 64 bytes in a loop */
+	.endr
+	subs	x4, x4, #64
+	b.ne	write_64
+	ands	w2, w2, #0x3f		/* remainder is below 64 */
+	b.eq	exit			/* exit if 0 */
+
+less_64:tbz	w2, #5, less_32		/* < 32 bytes */
+	stp	x1, x1, [x3], #16	/* write 32 bytes */
+	stp	x1, x1, [x3], #16
+	ands	w2, w2, #0x1f
+	b.eq	exit
+
+less_32:tbz	w2, #4, less_16		/* < 16 bytes */
+	stp	x1, x1, [x3], #16	/* write 16 bytes */
+	ands	w2, w2, #0xf
+	b.eq	exit
+
+less_16:tbz	w2, #3, less_8		/* < 8 bytes */
+	str	x1, [x3], #8		/* write 8 bytes */
+	ands	w2, w2, #7
+	b.eq	exit
+
+less_8:	tbz	w2, #2, less_4		/* < 4 bytes */
+	str	w1, [x3], #4		/* write 4 bytes */
+	ands	w2, w2, #3
+	b.eq	exit
+
+less_4:	tbz	w2, #1, less_2		/* < 2 bytes */
+	strh	w1, [x3], #2		/* write 2 bytes */
+	tbz	w2, #0, exit
+less_2:	strb	w1, [x3]		/* write 1 byte */
+exit:	ret
+
+endfunc memset
diff --git a/lib/libc/libc.mk b/lib/libc/libc.mk
index 93d30d035..90a2a1e16 100644
--- a/lib/libc/libc.mk
+++ b/lib/libc/libc.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2016-2019, ARM Limited and 
Contributors. All rights reserved. +# Copyright (c) 2016-2020, ARM Limited and Contributors. All rights reserved. # # SPDX-License-Identifier: BSD-3-Clause # @@ -13,7 +13,6 @@ LIBC_SRCS := $(addprefix lib/libc/, \ memcpy.c \ memmove.c \ memrchr.c \ - memset.c \ printf.c \ putchar.c \ puts.c \ @@ -28,8 +27,14 @@ LIBC_SRCS := $(addprefix lib/libc/, \ ifeq (${ARCH},aarch64) LIBC_SRCS += $(addprefix lib/libc/aarch64/, \ + memset.S \ setjmp.S) endif +ifeq (${ARCH},aarch32) +LIBC_SRCS += $(addprefix lib/libc/aarch32/, \ + memset.S) +endif + INCLUDES += -Iinclude/lib/libc \ -Iinclude/lib/libc/$(ARCH) \ diff --git a/lib/libc/memset.c b/lib/libc/memset.c deleted file mode 100644 index d8007d8e9..000000000 --- a/lib/libc/memset.c +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (c) 2013-2019, ARM Limited and Contributors. All rights reserved. - * - * SPDX-License-Identifier: BSD-3-Clause - */ - -#include -#include - -void *memset(void *dst, int val, size_t count) -{ - char *ptr = dst; - - while (count--) - *ptr++ = val; - - return dst; -} diff --git a/plat/nvidia/tegra/platform.mk b/plat/nvidia/tegra/platform.mk index a4724e64b..5cac46f64 100644 --- a/plat/nvidia/tegra/platform.mk +++ b/plat/nvidia/tegra/platform.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2019, ARM Limited and Contributors. All rights reserved. +# Copyright (c) 2015-2020, ARM Limited and Contributors. All rights reserved. # Copyright (c) 2020, NVIDIA Corporation. All rights reserved. # # SPDX-License-Identifier: BSD-3-Clause @@ -69,11 +69,11 @@ TF_CFLAGS += -Wsign-compare -nostdlib # override with necessary libc files for the Tegra platform override LIBC_SRCS := $(addprefix lib/libc/, \ + aarch64/memset.S \ aarch64/setjmp.S \ assert.c \ memcpy.c \ memmove.c \ - memset.c \ printf.c \ putchar.c \ strlen.c \