From 8ca61538a0fe3aed6764a012317cbf61f09ebb61 Mon Sep 17 00:00:00 2001 From: David Pu Date: Mon, 18 Mar 2019 15:14:49 -0700 Subject: [PATCH 1/4] Tegra194: add RAS exception handling This patch adds all Tegra194 RAS nodes definitions and support to handle all uncorrectable RAS errors. Change-Id: I109b5a8dbca91d92752dc282c4ca30f273c475f9 Signed-off-by: David Pu Signed-off-by: Varun Wadekar --- include/lib/extensions/ras.h | 2 + include/lib/extensions/ras_arch.h | 4 + lib/extensions/ras/ras_common.c | 42 +++ plat/nvidia/tegra/include/platform_def.h | 1 + .../tegra/include/t194/tegra194_ras_private.h | 260 ++++++++++++++++ plat/nvidia/tegra/include/tegra_private.h | 6 +- plat/nvidia/tegra/soc/t194/plat_ras.c | 287 ++++++++++++++++++ plat/nvidia/tegra/soc/t194/plat_setup.c | 5 + plat/nvidia/tegra/soc/t194/platform_t194.mk | 11 + 9 files changed, 617 insertions(+), 1 deletion(-) create mode 100644 plat/nvidia/tegra/include/t194/tegra194_ras_private.h create mode 100644 plat/nvidia/tegra/soc/t194/plat_ras.c diff --git a/include/lib/extensions/ras.h b/include/lib/extensions/ras.h index 4fc8f04b1..793ab9fac 100644 --- a/include/lib/extensions/ras.h +++ b/include/lib/extensions/ras.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2018, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -192,6 +193,7 @@ static inline int ras_err_ser_probe_sysreg(const struct err_record_info *info, probe_data); } +const char *ras_serr_to_str(unsigned int serr); int ras_ea_handler(unsigned int ea_reason, uint64_t syndrome, void *cookie, void *handle, uint64_t flags); void ras_init(void); diff --git a/include/lib/extensions/ras_arch.h b/include/lib/extensions/ras_arch.h index 0c98c4a0e..55760b06b 100644 --- a/include/lib/extensions/ras_arch.h +++ b/include/lib/extensions/ras_arch.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2018, ARM Limited and Contributors. All rights reserved. 
+ * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -151,6 +152,9 @@ #define ERROR_STATUS_SET_UC 0x2 /* Uncontainable */ #define ERROR_STATUS_SET_CE 0x3 /* Corrected */ +/* Number of architecturally-defined primary error codes */ +#define ERROR_STATUS_NUM_SERR U(22) + /* Implementation Defined Syndrome bit in ESR */ #define SERROR_IDS_BIT U(24) diff --git a/lib/extensions/ras/ras_common.c b/lib/extensions/ras/ras_common.c index 64a48524b..36f9a95b6 100644 --- a/lib/extensions/ras/ras_common.c +++ b/lib/extensions/ras/ras_common.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2018-2019, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. * * SPDX-License-Identifier: BSD-3-Clause */ @@ -18,6 +19,47 @@ # error Platform must define RAS priority value #endif +/* + * Function to convert architecturally-defined primary error code SERR, + * bits[7:0] from ERRSTATUS to its corresponding error string. + */ +const char *ras_serr_to_str(unsigned int serr) +{ + const char *str[ERROR_STATUS_NUM_SERR] = { + "No error", + "IMPLEMENTATION DEFINED error", + "Data value from (non-associative) internal memory", + "IMPLEMENTATION DEFINED pin", + "Assertion failure", + "Error detected on internal data path", + "Data value from associative memory", + "Address/control value from associative memory", + "Data value from a TLB", + "Address/control value from a TLB", + "Data value from producer", + "Address/control value from producer", + "Data value from (non-associative) external memory", + "Illegal address (software fault)", + "Illegal access (software fault)", + "Illegal state (software fault)", + "Internal data register", + "Internal control register", + "Error response from slave", + "External timeout", + "Internal timeout", + "Deferred error from slave not supported at master" + }; + + /* + * All other values are reserved. 
Reserved values might be defined + * in a future version of the architecture + */ + if (serr >= ERROR_STATUS_NUM_SERR) + return "unknown SERR"; + + return str[serr]; +} + /* Handler that receives External Aborts on RAS-capable systems */ int ras_ea_handler(unsigned int ea_reason, uint64_t syndrome, void *cookie, void *handle, uint64_t flags) diff --git a/plat/nvidia/tegra/include/platform_def.h b/plat/nvidia/tegra/include/platform_def.h index 678b15c14..2331869d2 100644 --- a/plat/nvidia/tegra/include/platform_def.h +++ b/plat/nvidia/tegra/include/platform_def.h @@ -95,6 +95,7 @@ * Platform macros to support exception handling framework ******************************************************************************/ #define PLAT_PRI_BITS U(3) +#define PLAT_RAS_PRI U(0x10) #define PLAT_SDEI_CRITICAL_PRI U(0x20) #define PLAT_SDEI_NORMAL_PRI U(0x30) #define PLAT_TEGRA_WDT_PRIO U(0x40) diff --git a/plat/nvidia/tegra/include/t194/tegra194_ras_private.h b/plat/nvidia/tegra/include/t194/tegra194_ras_private.h new file mode 100644 index 000000000..c867b9d2b --- /dev/null +++ b/plat/nvidia/tegra/include/t194/tegra194_ras_private.h @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#ifndef TEGRA194_RAS_PRIVATE +#define TEGRA194_RAS_PRIVATE + +#include + +/* Implementation defined RAS error and corresponding error message */ +struct ras_error { + const char *error_msg; + /* IERR(bits[15:8]) from ERRSTATUS */ + uint8_t error_code; +}; + +/* RAS error node-specific auxiliary data */ +struct ras_aux_data { + /* point to null-terminated ras_error array to convert error code to msg. */ + const struct ras_error *error_records; + /* + * function to return an value which needs to be programmed into ERXCTLR_EL1 + * to enable all specified RAS errors for current node. 
+ */ + uint64_t (*err_ctrl)(void); +}; + +/* IFU Uncorrectable RAS ERROR */ +#define IFU_UNCORR_RAS_ERROR_LIST(X) + +/* JSR_RET Uncorrectable RAS ERROR */ +#define JSR_RET_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(JSR_RET, 35, 0x13, "Floating Point Register File Parity Error") \ + X(JSR_RET, 34, 0x12, "Integer Register File Parity Error") \ + X(JSR_RET, 33, 0x11, "Garbage Bundle") \ + X(JSR_RET, 32, 0x10, "Bundle Completion Timeout") + +/* JSR_MTS Uncorrectable RAS ERROR */ +#define JSR_MTS_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(JSR_MTS, 40, 0x28, "CoreSight Access Error") \ + X(JSR_MTS, 39, 0x27, "Dual Execution Uncorrectable Error") \ + X(JSR_MTS, 37, 0x25, "CTU MMIO Region") \ + X(JSR_MTS, 36, 0x24, "MTS MMCRAB Region Access") \ + X(JSR_MTS, 35, 0x23, "MTS_CARVEOUT Access from ARM SW") \ + X(JSR_MTS, 34, 0x22, "NAFLL PLL Failure to Lock") \ + X(JSR_MTS, 32, 0x20, "Internal Uncorrectable MTS Error") + +/* LSD_STQ Uncorrectable RAS ERROR */ +#define LSD_STQ_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(LSD_STQ, 41, 0x39, "Coherent Cache Data Store Multi-Line ECC Error") \ + X(LSD_STQ, 40, 0x38, "Coherent Cache Data Store Uncorrectable ECC Error") \ + X(LSD_STQ, 38, 0x36, "Coherent Cache Data Load Uncorrectable ECC Error") \ + X(LSD_STQ, 33, 0x31, "Coherent Cache Tag Store Parity Error") \ + X(LSD_STQ, 32, 0x30, "Coherent Cache Tag Load Parity Error") + +/* LSD_DCC Uncorrectable RAS ERROR */ +#define LSD_DCC_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(LSD_DCC, 41, 0x49, "BTU Copy Mini-Cache PPN Multi-Hit Error") \ + X(LSD_DCC, 39, 0x47, "Coherent Cache Data Uncorrectable ECC Error") \ + X(LSD_DCC, 37, 0x45, "Version Cache Byte-Enable Parity Error") \ + X(LSD_DCC, 36, 0x44, "Version Cache Data Uncorrectable ECC Error") \ + X(LSD_DCC, 33, 0x41, "BTU Copy Coherent Cache PPN Parity Error") \ + X(LSD_DCC, 32, 0x40, "BTU Copy Coherent Cache VPN Parity 
Error") + +/* LSD_L1HPF Uncorrectable RAS ERROR */ +#define LSD_L1HPF_UNCORR_RAS_ERROR_LIST(X) + +/* L2 Uncorrectable RAS ERROR */ +#define L2_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(L2, 56, 0x68, "URT Timeout") \ + X(L2, 55, 0x67, "L2 Protocol Violation") \ + X(L2, 54, 0x66, "SCF to L2 Slave Error Read") \ + X(L2, 53, 0x65, "SCF to L2 Slave Error Write") \ + X(L2, 52, 0x64, "SCF to L2 Decode Error Read") \ + X(L2, 51, 0x63, "SCF to L2 Decode Error Write") \ + X(L2, 50, 0x62, "SCF to L2 Request Response Interface Parity Errors") \ + X(L2, 49, 0x61, "SCF to L2 Advance notice interface parity errors") \ + X(L2, 48, 0x60, "SCF to L2 Filldata Parity Errors") \ + X(L2, 47, 0x5F, "SCF to L2 UnCorrectable ECC Data Error on interface") \ + X(L2, 45, 0x5D, "Core 1 to L2 Parity Error") \ + X(L2, 44, 0x5C, "Core 0 to L2 Parity Error") \ + X(L2, 43, 0x5B, "L2 Multi-Hit") \ + X(L2, 42, 0x5A, "L2 URT Tag Parity Error") \ + X(L2, 41, 0x59, "L2 NTT Tag Parity Error") \ + X(L2, 40, 0x58, "L2 MLT Tag Parity Error") \ + X(L2, 39, 0x57, "L2 URD Data") \ + X(L2, 38, 0x56, "L2 NTP Data") \ + X(L2, 36, 0x54, "L2 MLC Uncorrectable Clean") \ + X(L2, 35, 0x53, "L2 URD Uncorrectable Dirty") \ + X(L2, 34, 0x52, "L2 MLC Uncorrectable Dirty") + +/* CLUSTER_CLOCKS Uncorrectable RAS ERROR */ +#define CLUSTER_CLOCKS_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(CLUSTER_CLOCKS, 32, 0xE4, "Frequency Monitor Error") + +/* MMU Uncorrectable RAS ERROR */ +#define MMU_UNCORR_RAS_ERROR_LIST(X) + +/* L3 Uncorrectable RAS ERROR */ +#define L3_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(L3, 43, 0x7B, "SNOC Interface Parity Error") \ + X(L3, 42, 0x7A, "MCF Interface Parity Error") \ + X(L3, 41, 0x79, "L3 Tag Parity Error") \ + X(L3, 40, 0x78, "L3 Dir Parity Error") \ + X(L3, 39, 0x77, "L3 Uncorrectable ECC Error") \ + X(L3, 37, 0x75, "Multi-Hit CAM Error") \ + X(L3, 36, 0x74, "Multi-Hit Tag Error") \ + X(L3, 35, 0x73, 
"Unrecognized Command Error") \ + X(L3, 34, 0x72, "L3 Protocol Error") + +/* CCPMU Uncorrectable RAS ERROR */ +#define CCPMU_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(CCPMU, 40, 0x87, "CoreSight Access Error") \ + X(CCPMU, 36, 0x84, "MCE Ucode Error") \ + X(CCPMU, 35, 0x83, "MCE IL1 Parity Error") \ + X(CCPMU, 34, 0x82, "MCE Timeout Error") \ + X(CCPMU, 33, 0x81, "CRAB Access Error") \ + X(CCPMU, 32, 0x80, "MCE Memory Access Error") + +/* SCF_IOB Uncorrectable RAS ERROR */ +#define SCF_IOB_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(SCF_IOB, 41, 0x99, "Request parity error") \ + X(SCF_IOB, 40, 0x98, "Putdata parity error") \ + X(SCF_IOB, 39, 0x97, "Uncorrectable ECC on Putdata") \ + X(SCF_IOB, 38, 0x96, "CBB Interface Error") \ + X(SCF_IOB, 37, 0x95, "MMCRAB Error") \ + X(SCF_IOB, 36, 0x94, "IHI Interface Error") \ + X(SCF_IOB, 35, 0x93, "CRI Error") \ + X(SCF_IOB, 34, 0x92, "TBX Interface Error") \ + X(SCF_IOB, 33, 0x91, "EVP Interface Error") + +/* SCF_SNOC Uncorrectable RAS ERROR */ +#define SCF_SNOC_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(SCF_SNOC, 42, 0xAA, "Misc Client Parity Error") \ + X(SCF_SNOC, 41, 0xA9, "Misc Filldata Parity Error") \ + X(SCF_SNOC, 40, 0xA8, "Uncorrectable ECC Misc Client") \ + X(SCF_SNOC, 39, 0xA7, "DVMU Interface Parity Error") \ + X(SCF_SNOC, 38, 0xA6, "DVMU Interface Timeout Error") \ + X(SCF_SNOC, 37, 0xA5, "CPE Request Error") \ + X(SCF_SNOC, 36, 0xA4, "CPE Response Error") \ + X(SCF_SNOC, 35, 0xA3, "CPE Timeout Error") \ + X(SCF_SNOC, 34, 0xA2, "Uncorrectable Carveout Error") + +/* SCF_CTU Uncorrectable RAS ERROR */ +#define SCF_CTU_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(SCF_CTU, 39, 0xB7, "Timeout error for TRC_DMA request") \ + X(SCF_CTU, 38, 0xB6, "Timeout error for CTU Snp") \ + X(SCF_CTU, 37, 0xB5, "Parity error in CTU TAG RAM") \ + X(SCF_CTU, 36, 0xB3, "Parity error in CTU DATA RAM") \ + X(SCF_CTU, 35, 
0xB4, "Parity error for Cluster Rsp") \ + X(SCF_CTU, 34, 0xB2, "Parity error for TRL requests from 9 agents") \ + X(SCF_CTU, 33, 0xB1, "Parity error for MCF request") \ + X(SCF_CTU, 32, 0xB0, "TRC DMA fillsnoop parity error") + +/* CMU_CLOCKS Uncorrectable RAS ERROR */ +#define CMU_CLOCKS_UNCORR_RAS_ERROR_LIST(X) \ + /* Name, ERR_CTRL, IERR, ISA Desc */ \ + X(CMU_CLOCKS, 39, 0xC7, "Cluster 3 frequency monitor error") \ + X(CMU_CLOCKS, 38, 0xC6, "Cluster 2 frequency monitor error") \ + X(CMU_CLOCKS, 37, 0xC5, "Cluster 1 frequency monitor error") \ + X(CMU_CLOCKS, 36, 0xC3, "Cluster 0 frequency monitor error") \ + X(CMU_CLOCKS, 35, 0xC4, "Voltage error on ADC1 Monitored Logic") \ + X(CMU_CLOCKS, 34, 0xC2, "Voltage error on ADC0 Monitored Logic") \ + X(CMU_CLOCKS, 33, 0xC1, "Lookup Table 1 Parity Error") \ + X(CMU_CLOCKS, 32, 0xC0, "Lookup Table 0 Parity Error") + +/* + * Define one ras_error entry. + * + * This macro will be used to generate ras_error records for each node + * defined by <node>_UNCORR_RAS_ERROR_LIST macro. + */ +#define DEFINE_ONE_RAS_ERROR_MSG(unit, ras_bit, ierr, msg) \ + { \ + .error_msg = (msg), \ + .error_code = (ierr) \ + }, + +/* + * Set one implementation defined bit in ERRCTLR + * + * This macro will be used to collect all defined ERR_CTRL bits for each node + * defined by <node>_UNCORR_RAS_ERROR_LIST macro. 
+ */ +#define DEFINE_ENABLE_RAS_BIT(unit, ras_bit, ierr, msg) \ + do { \ + val |= (1ULL << ras_bit##U); \ + } while (0); + +/* Represent one RAS node with 0 or more error bits (ERR_CTLR) enabled */ +#define DEFINE_ONE_RAS_NODE(node) \ +static const struct ras_error node##_uncorr_ras_errors[] = { \ + node##_UNCORR_RAS_ERROR_LIST(DEFINE_ONE_RAS_ERROR_MSG) \ + { \ + NULL, \ + 0U \ + }, \ +}; \ +static inline uint64_t node##_err_ctrl(void) \ +{ \ + uint64_t val = 0ULL; \ + node##_UNCORR_RAS_ERROR_LIST(DEFINE_ENABLE_RAS_BIT) \ + return val; \ +} + +#define DEFINE_ONE_RAS_AUX_DATA(node) \ + { \ + .error_records = node##_uncorr_ras_errors, \ + .err_ctrl = &node##_err_ctrl \ + }, + +#define PER_CORE_RAS_NODE_LIST(X) \ + X(IFU) \ + X(JSR_RET) \ + X(JSR_MTS) \ + X(LSD_STQ) \ + X(LSD_DCC) \ + X(LSD_L1HPF) + +#define PER_CORE_RAS_GROUP_NODES PER_CORE_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) + +#define PER_CLUSTER_RAS_NODE_LIST(X) \ + X(L2) \ + X(CLUSTER_CLOCKS) \ + X(MMU) + +#define PER_CLUSTER_RAS_GROUP_NODES PER_CLUSTER_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) + +#define SCF_L3_BANK_RAS_NODE_LIST(X) X(L3) + +/* we have 4 SCF_L3 nodes:3*256 + L3_Bank_ID(0-3) */ +#define SCF_L3_BANK_RAS_GROUP_NODES \ + SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) \ + SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) \ + SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) \ + SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) + +#define CCPLEX_RAS_NODE_LIST(X) \ + X(CCPMU) \ + X(SCF_IOB) \ + X(SCF_SNOC) \ + X(SCF_CTU) \ + X(CMU_CLOCKS) + +#define CCPLEX_RAS_GROUP_NODES CCPLEX_RAS_NODE_LIST(DEFINE_ONE_RAS_AUX_DATA) + +#endif /* TEGRA194_RAS_PRIVATE */ diff --git a/plat/nvidia/tegra/include/tegra_private.h b/plat/nvidia/tegra/include/tegra_private.h index f72c9cf3c..a6d8e6886 100644 --- a/plat/nvidia/tegra/include/tegra_private.h +++ b/plat/nvidia/tegra/include/tegra_private.h @@ -89,7 +89,7 @@ int32_t plat_lock_cpu_vectors(void); /* Declarations for tegra_fiq_glue.c */ void 
tegra_fiq_handler_setup(void); -int tegra_fiq_get_intr_context(void); +int32_t tegra_fiq_get_intr_context(void); void tegra_fiq_set_ns_entrypoint(uint64_t entrypoint); /* Declarations for tegra_security.c */ @@ -157,4 +157,8 @@ int plat_sip_handler(uint32_t smc_fid, void *handle, uint64_t flags); +#if RAS_EXTENSION +void tegra194_ras_enable(void); +#endif + #endif /* TEGRA_PRIVATE_H */ diff --git a/plat/nvidia/tegra/soc/t194/plat_ras.c b/plat/nvidia/tegra/soc/t194/plat_ras.c new file mode 100644 index 000000000..f9ebb37e4 --- /dev/null +++ b/plat/nvidia/tegra/soc/t194/plat_ras.c @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2020, NVIDIA Corporation. All rights reserved. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* + * ERRFR bits[63:32], it indicates supported RAS errors which can be enabled + * by setting corresponding bits in ERRCTLR + */ +#define ERR_FR_EN_BITS_MASK 0xFFFFFFFF00000000ULL + +/* bakery lock for platform RAS handler. */ +static DEFINE_BAKERY_LOCK(ras_handler_lock); +#define ras_lock() bakery_lock_get(&ras_handler_lock) +#define ras_unlock() bakery_lock_release(&ras_handler_lock) + +/* + * Function to handle an External Abort received at EL3. + * This function is invoked by RAS framework. 
+ */ +static void tegra194_ea_handler(unsigned int ea_reason, uint64_t syndrome, + void *cookie, void *handle, uint64_t flags) +{ + int32_t ret; + + ras_lock(); + + ERROR("exception reason=%u syndrome=0x%llx on 0x%lx at EL3.\n", + ea_reason, syndrome, read_mpidr_el1()); + + /* Call RAS EA handler */ + ret = ras_ea_handler(ea_reason, syndrome, cookie, handle, flags); + if (ret != 0) { + ERROR("RAS error handled!\n"); + ret = sdei_dispatch_event(TEGRA_SDEI_EP_EVENT_0 + + plat_my_core_pos()); + if (ret != 0) + ERROR("sdei_dispatch_event returned %d\n", ret); + } else { + ERROR("Not a RAS error!\n"); + } + + ras_unlock(); +} + +/* Function to enable uncorrectable errors as External abort (SError) */ +void tegra194_ras_enable(void) +{ + VERBOSE("%s\n", __func__); + + /* skip RAS enablement if not a silicon platform. */ + if (!tegra_platform_is_silicon()) { + return; + } + + /* + * Iterate for each group(num_idx ERRSELRs starting from idx_start) + * use normal for loop instead of for_each_err_record_info to get rid + * of MISRA noise.. + */ + for (uint32_t i = 0U; i < err_record_mappings.num_err_records; i++) { + + const struct err_record_info *info = &err_record_mappings.err_records[i]; + + uint32_t idx_start = info->sysreg.idx_start; + uint32_t num_idx = info->sysreg.num_idx; + const struct ras_aux_data *aux_data = (const struct ras_aux_data *)info->aux_data; + + assert(aux_data != NULL); + + for (uint32_t j = 0; j < num_idx; j++) { + uint64_t err_ctrl = 0ULL; + + /* enable SError reporting for uncorrectable error */ + ERR_CTLR_ENABLE_FIELD(err_ctrl, UE); + ERR_CTLR_ENABLE_FIELD(err_ctrl, ED); + + /* + * Catch error if something wrong with the RAS aux data + * record table. 
+ */ + assert(aux_data[j].err_ctrl != NULL); + + /* enable the specified errors */ + err_ctrl |= aux_data[j].err_ctrl(); + + /* Write to ERRSELR_EL1 to select the error record */ + ser_sys_select_record(idx_start + j); + + /* enable specified errors */ + write_erxctlr_el1(err_ctrl); + + /* + * Check if all the bit settings have been enabled to detect + * uncorrected/corrected errors, if not assert. + */ + assert(read_erxctlr_el1() == err_ctrl); + } + } +} + +/* Function to probe an error from error record group. */ +static int32_t tegra194_ras_record_probe(const struct err_record_info *info, + int *probe_data) +{ + /* Skip probing if not a silicon platform */ + if (!tegra_platform_is_silicon()) { + return 0; + } + + return ser_probe_sysreg(info->sysreg.idx_start, info->sysreg.num_idx, probe_data); +} + +/* Function to handle error from one given node */ +static int32_t tegra194_ras_node_handler(const struct ras_error *errors, uint64_t status) +{ + bool found = false; + uint32_t ierr = (uint32_t)ERR_STATUS_GET_FIELD(status, IERR); + uint32_t serr = (uint32_t)ERR_STATUS_GET_FIELD(status, SERR); + + /* IERR to error message */ + for (uint32_t i = 0; errors[i].error_msg != NULL; i++) { + if (ierr == errors[i].error_code) { + ERROR("IERR = %s(0x%x)\n", + errors[i].error_msg, errors[i].error_code); + found = true; + break; + } + } + if (!found) { + ERROR("unknown IERR: 0x%x\n", ierr); + } + + ERROR("SERR = %s(0x%x)\n", ras_serr_to_str(serr), serr); + + /* Write to clear reported errors. */ + write_erxstatus_el1(status); + + return 0; +} + +/* Function to handle one error node from an error record group. 
*/ +static int32_t tegra194_ras_record_handler(const struct err_record_info *info, + int probe_data, const struct err_handler_data *const data) +{ + uint32_t num_idx = info->sysreg.num_idx; + uint32_t idx_start = info->sysreg.idx_start; + const struct ras_aux_data *aux_data = info->aux_data; + + uint64_t status = 0ULL; + + VERBOSE("%s\n", __func__); + + assert(probe_data >= 0); + assert((uint32_t)probe_data < num_idx); + + uint32_t offset = (uint32_t)probe_data; + const struct ras_error *errors = aux_data[offset].error_records; + + assert(errors != NULL); + + /* Write to ERRSELR_EL1 to select the error record */ + ser_sys_select_record(idx_start + offset); + + /* Retrieve status register from the error record */ + status = read_erxstatus_el1(); + + assert(ERR_STATUS_GET_FIELD(status, V) != 0U); + assert(ERR_STATUS_GET_FIELD(status, UE) != 0U); + + return tegra194_ras_node_handler(errors, status); +} + + +/* Instantiate RAS nodes */ +PER_CORE_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE) +PER_CLUSTER_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE) +SCF_L3_BANK_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE) +CCPLEX_RAS_NODE_LIST(DEFINE_ONE_RAS_NODE) + +/* Instantiate RAS node groups */ +static struct ras_aux_data per_core_ras_group[] = { + PER_CORE_RAS_GROUP_NODES +}; + +static struct ras_aux_data per_cluster_ras_group[] = { + PER_CLUSTER_RAS_GROUP_NODES +}; + +static struct ras_aux_data scf_l3_ras_group[] = { + SCF_L3_BANK_RAS_GROUP_NODES +}; + +static struct ras_aux_data ccplex_ras_group[] = { + CCPLEX_RAS_GROUP_NODES +}; + +/* + * We have same probe and handler for each error record group, use a macro to + * simplify the record definition. 
+ */ +#define ADD_ONE_ERR_GROUP(errselr_start, group) \ + ERR_RECORD_SYSREG_V1((errselr_start), (uint32_t)ARRAY_SIZE((group)), \ + &tegra194_ras_record_probe, \ + &tegra194_ras_record_handler, (group)) + +/* RAS error record group information */ +static struct err_record_info carmel_ras_records[] = { + /* + * Per core ras error records + * ERRSELR starts from 0*256 + Logical_CPU_ID*16 + 0 to + * 0*256 + Logical_CPU_ID*16 + 5 for each group. + * 8 cores/groups, 6 * 8 nodes in total. + */ + ADD_ONE_ERR_GROUP(0x000, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x010, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x020, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x030, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x040, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x050, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x060, per_core_ras_group), + ADD_ONE_ERR_GROUP(0x070, per_core_ras_group), + + /* + * Per cluster ras error records + * ERRSELR starts from 2*256 + Logical_Cluster_ID*16 + 0 to + * 2*256 + Logical_Cluster_ID*16 + 2. + * 4 clusters/groups, 3 * 4 nodes in total. + */ + ADD_ONE_ERR_GROUP(0x200, per_cluster_ras_group), + ADD_ONE_ERR_GROUP(0x210, per_cluster_ras_group), + ADD_ONE_ERR_GROUP(0x220, per_cluster_ras_group), + ADD_ONE_ERR_GROUP(0x230, per_cluster_ras_group), + + /* + * SCF L3_Bank ras error records + * ERRSELR: 3*256 + L3_Bank_ID, L3_Bank_ID: 0-3 + * 1 group, 4 nodes in total. + */ + ADD_ONE_ERR_GROUP(0x300, scf_l3_ras_group), + + /* + * CCPLEX ras error records + * ERRSELR: 4*256 + Unit_ID, Unit_ID: 0 - 4 + * 1 group, 5 nodes in total. 
+ */ + ADD_ONE_ERR_GROUP(0x400, ccplex_ras_group), +}; + +REGISTER_ERR_RECORD_INFO(carmel_ras_records); + +/* dummy RAS interrupt */ +static struct ras_interrupt carmel_ras_interrupts[] = {}; +REGISTER_RAS_INTERRUPTS(carmel_ras_interrupts); + +/******************************************************************************* + * RAS handler for the platform + ******************************************************************************/ +void plat_ea_handler(unsigned int ea_reason, uint64_t syndrome, void *cookie, + void *handle, uint64_t flags) +{ +#if RAS_EXTENSION + tegra194_ea_handler(ea_reason, syndrome, cookie, handle, flags); +#else + ERROR("Unhandled External Abort received on 0x%llx at EL3!\n", + read_mpidr_el1()); + ERROR(" exception reason=%u syndrome=0x%lx\n", ea_reason, syndrome); + panic(); +#endif +} diff --git a/plat/nvidia/tegra/soc/t194/plat_setup.c b/plat/nvidia/tegra/soc/t194/plat_setup.c index 5d6c60b6c..399aebb05 100644 --- a/plat/nvidia/tegra/soc/t194/plat_setup.c +++ b/plat/nvidia/tegra/soc/t194/plat_setup.c @@ -208,6 +208,11 @@ void plat_early_platform_setup(void) /* sanity check MCE firmware compatibility */ mce_verify_firmware_version(); +#if RAS_EXTENSION + /* Enable Uncorrectable RAS error */ + tegra194_ras_enable(); +#endif + /* * Program XUSB STREAMIDs * ====================== diff --git a/plat/nvidia/tegra/soc/t194/platform_t194.mk b/plat/nvidia/tegra/soc/t194/platform_t194.mk index c02128ccc..d7d15f556 100644 --- a/plat/nvidia/tegra/soc/t194/platform_t194.mk +++ b/plat/nvidia/tegra/soc/t194/platform_t194.mk @@ -30,6 +30,10 @@ $(eval $(call add_define,MAX_XLAT_TABLES)) MAX_MMAP_REGIONS := 30 $(eval $(call add_define,MAX_MMAP_REGIONS)) +# enable RAS handling +HANDLE_EA_EL3_FIRST := 1 +RAS_EXTENSION := 1 + # platform files PLAT_INCLUDES += -Iplat/nvidia/tegra/include/t194 \ -I${SOC_DIR}/drivers/include @@ -56,3 +60,10 @@ BL31_SOURCES += drivers/ti/uart/aarch64/16550_console.S \ ifeq (${ENABLE_CONSOLE_SPE},1) BL31_SOURCES += 
${COMMON_DIR}/drivers/spe/shared_console.S endif + +# RAS sources +ifeq (${RAS_EXTENSION},1) +BL31_SOURCES += lib/extensions/ras/std_err_record.c \ + lib/extensions/ras/ras_common.c \ + ${SOC_DIR}/plat_ras.c +endif From 0d8511953e19a5da80ac1a0ed9ec8e76b57a33a8 Mon Sep 17 00:00:00 2001 From: Varun Wadekar Date: Thu, 21 Mar 2019 08:23:05 -0700 Subject: [PATCH 2/4] Tegra194: SiP: clear RAS corrected error records This patch introduces a function ID to clear all the RAS error records for corrected errors. Per latest requirement, ARM RAS corrected errors will be reported to lower ELs via interrupts and cleared via SMC. This patch provides required function to clear RAS error status. This patch also sets up all required RAS Corrected errors in order to route RAS corrected errors to lower ELs. Change-Id: I554ba1d0797b736835aa27824782703682c91e51 Signed-off-by: Varun Wadekar Signed-off-by: David Pu --- plat/nvidia/tegra/include/tegra_private.h | 1 + .../soc/t194/drivers/include/mce_private.h | 2 + plat/nvidia/tegra/soc/t194/drivers/mce/mce.c | 8 + plat/nvidia/tegra/soc/t194/drivers/mce/nvg.c | 12 ++ plat/nvidia/tegra/soc/t194/plat_ras.c | 145 ++++++++++++++---- plat/nvidia/tegra/soc/t194/plat_sip_calls.c | 11 ++ 6 files changed, 149 insertions(+), 30 deletions(-) diff --git a/plat/nvidia/tegra/include/tegra_private.h b/plat/nvidia/tegra/include/tegra_private.h index a6d8e6886..c181c3618 100644 --- a/plat/nvidia/tegra/include/tegra_private.h +++ b/plat/nvidia/tegra/include/tegra_private.h @@ -159,6 +159,7 @@ int plat_sip_handler(uint32_t smc_fid, #if RAS_EXTENSION void tegra194_ras_enable(void); +void tegra194_ras_corrected_err_clear(void); #endif #endif /* TEGRA_PRIVATE_H */ diff --git a/plat/nvidia/tegra/soc/t194/drivers/include/mce_private.h b/plat/nvidia/tegra/soc/t194/drivers/include/mce_private.h index 1fe3aad39..6dafeb246 100644 --- a/plat/nvidia/tegra/soc/t194/drivers/include/mce_private.h +++ b/plat/nvidia/tegra/soc/t194/drivers/include/mce_private.h @@ -58,6 +58,7 
@@ int32_t nvg_roc_clean_cache_trbits(void); void nvg_enable_strict_checking_mode(void); void nvg_system_shutdown(void); void nvg_system_reboot(void); +void nvg_clear_hsm_corr_status(void); /* declarations for assembly functions */ void nvg_set_request_data(uint64_t req, uint64_t data); @@ -71,5 +72,6 @@ uint64_t nvg_cache_inval_all(void); void mce_enable_strict_checking(void); void mce_system_shutdown(void); void mce_system_reboot(void); +void mce_clear_hsm_corr_status(void); #endif /* MCE_PRIVATE_H */ diff --git a/plat/nvidia/tegra/soc/t194/drivers/mce/mce.c b/plat/nvidia/tegra/soc/t194/drivers/mce/mce.c index 7edd7a09e..4663a3d27 100644 --- a/plat/nvidia/tegra/soc/t194/drivers/mce/mce.c +++ b/plat/nvidia/tegra/soc/t194/drivers/mce/mce.c @@ -234,3 +234,11 @@ void mce_system_reboot(void) { nvg_system_reboot(); } + +/******************************************************************************* + * Handler to clear CCPLEX->HSM correctable RAS error signal. + ******************************************************************************/ +void mce_clear_hsm_corr_status(void) +{ + nvg_clear_hsm_corr_status(); +} diff --git a/plat/nvidia/tegra/soc/t194/drivers/mce/nvg.c b/plat/nvidia/tegra/soc/t194/drivers/mce/nvg.c index ef740a143..fdf94292c 100644 --- a/plat/nvidia/tegra/soc/t194/drivers/mce/nvg.c +++ b/plat/nvidia/tegra/soc/t194/drivers/mce/nvg.c @@ -236,3 +236,15 @@ void nvg_system_shutdown(void) nvg_set_request_data((uint64_t)TEGRA_NVG_CHANNEL_SHUTDOWN, (uint64_t)TEGRA_NVG_SHUTDOWN); } + +/* + * Request to clear CCPLEX->HSM correctable error signal. + * NVGDATA[1]: A write of 1 clears the CCPLEX->HSM correctable error signal, + * A write of 0 has no effect. 
+ */ +void nvg_clear_hsm_corr_status(void) +{ + nvg_hsm_error_ctrl_channel_t status = { .bits = { .corr = 1U, }, }; + + nvg_set_request_data((uint64_t)TEGRA_NVG_CHANNEL_HSM_ERROR_CTRL, status.flat); +} diff --git a/plat/nvidia/tegra/soc/t194/plat_ras.c b/plat/nvidia/tegra/soc/t194/plat_ras.c index f9ebb37e4..eb896a4af 100644 --- a/plat/nvidia/tegra/soc/t194/plat_ras.c +++ b/plat/nvidia/tegra/soc/t194/plat_ras.c @@ -60,7 +60,12 @@ static void tegra194_ea_handler(unsigned int ea_reason, uint64_t syndrome, ras_unlock(); } -/* Function to enable uncorrectable errors as External abort (SError) */ +/* + * Function to enable all supported RAS error report. + * + * Uncorrected errors are set to report as External abort (SError) + * Corrected errors are set to report as interrupt. + */ void tegra194_ras_enable(void) { VERBOSE("%s\n", __func__); @@ -86,11 +91,15 @@ void tegra194_ras_enable(void) assert(aux_data != NULL); for (uint32_t j = 0; j < num_idx; j++) { - uint64_t err_ctrl = 0ULL; - /* enable SError reporting for uncorrectable error */ - ERR_CTLR_ENABLE_FIELD(err_ctrl, UE); - ERR_CTLR_ENABLE_FIELD(err_ctrl, ED); + /* ERRCTLR register value. */ + uint64_t err_ctrl = 0ULL; + /* all supported errors for this node. */ + uint64_t err_fr; + /* uncorrectable errors */ + uint64_t uncorr_errs; + /* correctable errors */ + uint64_t corr_errs; /* * Catch error if something wrong with the RAS aux data @@ -98,13 +107,37 @@ void tegra194_ras_enable(void) */ assert(aux_data[j].err_ctrl != NULL); - /* enable the specified errors */ - err_ctrl |= aux_data[j].err_ctrl(); - - /* Write to ERRSELR_EL1 to select the error record */ + /* + * Write to ERRSELR_EL1 to select the RAS error node. + * Always program this at first to select corresponding + * RAS node before any other RAS register r/w. 
+ */ ser_sys_select_record(idx_start + j); - /* enable specified errors */ + err_fr = read_erxfr_el1() & ERR_FR_EN_BITS_MASK; + uncorr_errs = aux_data[j].err_ctrl(); + corr_errs = ~uncorr_errs & err_fr; + + /* enable error reporting */ + ERR_CTLR_ENABLE_FIELD(err_ctrl, ED); + + /* enable SError reporting for uncorrectable errors */ + if ((uncorr_errs & err_fr) != 0ULL) { + ERR_CTLR_ENABLE_FIELD(err_ctrl, UE); + } + + /* generate interrupt for corrected errors. */ + if (corr_errs != 0ULL) { + ERR_CTLR_ENABLE_FIELD(err_ctrl, CFI); + } + + /* enable the supported errors */ + err_ctrl |= err_fr; + + VERBOSE("errselr_el1:0x%x, erxfr:0x%llx, err_ctrl:0x%llx\n", + idx_start + j, err_fr, err_ctrl); + + /* enable specified errors, or set to 0 if no supported error */ write_erxctlr_el1(err_ctrl); /* @@ -116,6 +149,42 @@ void tegra194_ras_enable(void) } } +/* + * Function to clear RAS ERRSTATUS for corrected RAS error. + * This function ignores any new RAS error signaled during clearing; it is not + * multi-core safe(no ras_lock is taken to reduce overhead). + */ +void tegra194_ras_corrected_err_clear(void) +{ + uint64_t clear_ce_status = 0ULL; + + ERR_STATUS_SET_FIELD(clear_ce_status, AV, 0x1UL); + ERR_STATUS_SET_FIELD(clear_ce_status, V, 0x1UL); + ERR_STATUS_SET_FIELD(clear_ce_status, OF, 0x1UL); + ERR_STATUS_SET_FIELD(clear_ce_status, MV, 0x1UL); + ERR_STATUS_SET_FIELD(clear_ce_status, CE, 0x3UL); + + for (uint32_t i = 0U; i < err_record_mappings.num_err_records; i++) { + + const struct err_record_info *info = &err_record_mappings.err_records[i]; + uint32_t idx_start = info->sysreg.idx_start; + uint32_t num_idx = info->sysreg.num_idx; + + for (uint32_t j = 0U; j < num_idx; j++) { + + uint64_t status; + uint32_t err_idx = idx_start + j; + + write_errselr_el1(err_idx); + status = read_erxstatus_el1(); + + if (ERR_STATUS_GET_FIELD(status, CE) != 0U) { + write_erxstatus_el1(clear_ce_status); + } + } + } +} + /* Function to probe an error from error record group. 
*/ static int32_t tegra194_ras_record_probe(const struct err_record_info *info, int *probe_data) @@ -129,26 +198,43 @@ static int32_t tegra194_ras_record_probe(const struct err_record_info *info, } /* Function to handle error from one given node */ -static int32_t tegra194_ras_node_handler(const struct ras_error *errors, uint64_t status) +static int32_t tegra194_ras_node_handler(uint32_t errselr, + const struct ras_error *errors, uint64_t status) { bool found = false; uint32_t ierr = (uint32_t)ERR_STATUS_GET_FIELD(status, IERR); uint32_t serr = (uint32_t)ERR_STATUS_GET_FIELD(status, SERR); - /* IERR to error message */ - for (uint32_t i = 0; errors[i].error_msg != NULL; i++) { - if (ierr == errors[i].error_code) { - ERROR("IERR = %s(0x%x)\n", - errors[i].error_msg, errors[i].error_code); - found = true; - break; - } - } - if (!found) { - ERROR("unknown IERR: 0x%x\n", ierr); + /* not a valid error. */ + if (ERR_STATUS_GET_FIELD(status, V) == 0U) { + return 0; } - ERROR("SERR = %s(0x%x)\n", ras_serr_to_str(serr), serr); + /* Print uncorrectable errror information. */ + if (ERR_STATUS_GET_FIELD(status, UE) != 0U) { + + /* IERR to error message */ + for (uint32_t i = 0; errors[i].error_msg != NULL; i++) { + if (ierr == errors[i].error_code) { + ERROR("ERRSELR_EL1:0x%x\n, IERR = %s(0x%x)\n", + errselr, errors[i].error_msg, + errors[i].error_code); + found = true; + break; + } + } + + if (!found) { + ERROR("unknown uncorrectable eror, " + "ERRSELR_EL1:0x%x, IERR: 0x%x\n", errselr, ierr); + } + + ERROR("SERR = %s(0x%x)\n", ras_serr_to_str(serr), serr); + } else { + /* For corrected error, simply clear it. */ + VERBOSE("corrected RAS error is cleared: ERRSELR_EL1:0x%x, " + "IERR:0x%x, SERR:0x%x\n", errselr, ierr, serr); + } /* Write to clear reported errors. */ write_erxstatus_el1(status); @@ -158,11 +244,13 @@ static int32_t tegra194_ras_node_handler(const struct ras_error *errors, uint64_ /* Function to handle one error node from an error record group. 
*/ static int32_t tegra194_ras_record_handler(const struct err_record_info *info, - int probe_data, const struct err_handler_data *const data) + int probe_data, const struct err_handler_data *const data __unused) { uint32_t num_idx = info->sysreg.num_idx; uint32_t idx_start = info->sysreg.idx_start; const struct ras_aux_data *aux_data = info->aux_data; + const struct ras_error *errors; + uint32_t offset; uint64_t status = 0ULL; @@ -171,8 +259,8 @@ static int32_t tegra194_ras_record_handler(const struct err_record_info *info, assert(probe_data >= 0); assert((uint32_t)probe_data < num_idx); - uint32_t offset = (uint32_t)probe_data; - const struct ras_error *errors = aux_data[offset].error_records; + offset = (uint32_t)probe_data; + errors = aux_data[offset].error_records; assert(errors != NULL); @@ -182,10 +270,7 @@ static int32_t tegra194_ras_record_handler(const struct err_record_info *info, /* Retrieve status register from the error record */ status = read_erxstatus_el1(); - assert(ERR_STATUS_GET_FIELD(status, V) != 0U); - assert(ERR_STATUS_GET_FIELD(status, UE) != 0U); - - return tegra194_ras_node_handler(errors, status); + return tegra194_ras_node_handler(idx_start + offset, errors, status); } diff --git a/plat/nvidia/tegra/soc/t194/plat_sip_calls.c b/plat/nvidia/tegra/soc/t194/plat_sip_calls.c index 884762de7..a3f996d45 100644 --- a/plat/nvidia/tegra/soc/t194/plat_sip_calls.c +++ b/plat/nvidia/tegra/soc/t194/plat_sip_calls.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -23,6 +24,7 @@ * Tegra194 SiP SMCs ******************************************************************************/ #define TEGRA_SIP_GET_SMMU_PER 0xC200FF00U +#define TEGRA_SIP_CLEAR_RAS_CORRECTED_ERRORS 0xC200FF01U /******************************************************************************* * This function is responsible for handling all T194 SiP calls @@ -69,6 +71,15 @@ int32_t plat_sip_handler(uint32_t smc_fid, break; +#if RAS_EXTENSION + case 
TEGRA_SIP_CLEAR_RAS_CORRECTED_ERRORS: + /* clear all RAS error records for corrected errors at first. */ + tegra194_ras_corrected_err_clear(); + /* clear HSM corrected error status. */ + mce_clear_hsm_corr_status(); + break; +#endif + default: ret = -ENOTSUP; break; From fbc44bd1bbbafe01848afd009d507b595b264b5f Mon Sep 17 00:00:00 2001 From: Varun Wadekar Date: Fri, 12 Jun 2020 10:11:28 -0700 Subject: [PATCH 3/4] Prevent RAS register access from lower ELs This patch adds a build config 'RAS_TRAP_LOWER_EL_ERR_ACCESS' to set SCR_EL3.TERR during CPU boot. This bit enables trapping RAS register accesses from EL1 or EL2 to EL3. RAS_TRAP_LOWER_EL_ERR_ACCESS is disabled by default. Signed-off-by: Varun Wadekar Change-Id: Ifb0fb0afedea7dd2a29a0b0491a1161ecd241438 --- Makefile | 2 ++ docs/components/ras.rst | 3 ++- docs/getting_started/build-options.rst | 4 ++++ include/arch/aarch64/arch.h | 1 + lib/el3_runtime/aarch64/context_mgmt.c | 8 ++++++++ make_helpers/defaults.mk | 3 +++ 6 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index a0d2ae066..bc5604be2 100644 --- a/Makefile +++ b/Makefile @@ -900,6 +900,7 @@ $(eval $(call assert_boolean,USE_SPINLOCK_CAS)) $(eval $(call assert_boolean,ENCRYPT_BL31)) $(eval $(call assert_boolean,ENCRYPT_BL32)) $(eval $(call assert_boolean,ERRATA_SPECULATIVE_AT)) +$(eval $(call assert_boolean,RAS_TRAP_LOWER_EL_ERR_ACCESS)) $(eval $(call assert_numeric,ARM_ARCH_MAJOR)) $(eval $(call assert_numeric,ARM_ARCH_MINOR)) @@ -979,6 +980,7 @@ $(eval $(call add_define,BL2_IN_XIP_MEM)) $(eval $(call add_define,BL2_INV_DCACHE)) $(eval $(call add_define,USE_SPINLOCK_CAS)) $(eval $(call add_define,ERRATA_SPECULATIVE_AT)) +$(eval $(call add_define,RAS_TRAP_LOWER_EL_ERR_ACCESS)) ifeq (${SANITIZE_UB},trap) $(eval $(call add_define,MONITOR_TRAPS)) diff --git a/docs/components/ras.rst b/docs/components/ras.rst index 3d81f17e9..86529d740 100644 --- a/docs/components/ras.rst +++ b/docs/components/ras.rst @@ -32,7 +32,8 @@ 
introduced by the RAS extensions. The build option ``RAS_EXTENSION`` when set to ``1`` includes the RAS in run time firmware; ``EL3_EXCEPTION_HANDLING`` and ``HANDLE_EA_EL3_FIRST`` must also -be set ``1``. +be set to ``1``. ``RAS_TRAP_LOWER_EL_ERR_ACCESS`` controls the access to the RAS +error record registers from lower ELs. .. _ras-figure: diff --git a/docs/getting_started/build-options.rst b/docs/getting_started/build-options.rst index 920f934af..f207886fb 100644 --- a/docs/getting_started/build-options.rst +++ b/docs/getting_started/build-options.rst @@ -707,6 +707,10 @@ Common build options | 1530924 | Cortex-A53 | +---------+--------------+ +- ``RAS_TRAP_LOWER_EL_ERR_ACCESS``: This flag enables/disables the SCR_EL3.TERR + bit, to trap access to the RAS ERR and RAS ERX registers from lower ELs. + This flag is disabled by default. + GICv3 driver options -------------------- diff --git a/include/arch/aarch64/arch.h b/include/arch/aarch64/arch.h index 10fe926ea..90569c3cf 100644 --- a/include/arch/aarch64/arch.h +++ b/include/arch/aarch64/arch.h @@ -342,6 +342,7 @@ #define SCR_EEL2_BIT (U(1) << 18) #define SCR_API_BIT (U(1) << 17) #define SCR_APK_BIT (U(1) << 16) +#define SCR_TERR_BIT (U(1) << 15) #define SCR_TWE_BIT (U(1) << 13) #define SCR_TWI_BIT (U(1) << 12) #define SCR_ST_BIT (U(1) << 11) diff --git a/lib/el3_runtime/aarch64/context_mgmt.c b/lib/el3_runtime/aarch64/context_mgmt.c index 53b4ea3e3..f4a34bfaa 100644 --- a/lib/el3_runtime/aarch64/context_mgmt.c +++ b/lib/el3_runtime/aarch64/context_mgmt.c @@ -108,6 +108,14 @@ void cm_setup_context(cpu_context_t *ctx, const entry_point_info_t *ep) if (EP_GET_ST(ep->h.attr) != 0U) scr_el3 |= SCR_ST_BIT; +#if RAS_TRAP_LOWER_EL_ERR_ACCESS + /* + * SCR_EL3.TERR: Trap Error record accesses. Accesses to the RAS ERR + * and RAS ERX registers from EL1 and EL2 are trapped to EL3.
+ */ + scr_el3 |= SCR_TERR_BIT; +#endif + #if !HANDLE_EA_EL3_FIRST /* * SCR_EL3.EA: Do not route External Abort and SError Interrupt External diff --git a/make_helpers/defaults.mk b/make_helpers/defaults.mk index 585f06fcc..6db228f2d 100644 --- a/make_helpers/defaults.mk +++ b/make_helpers/defaults.mk @@ -302,3 +302,6 @@ SUPPORT_STACK_MEMTAG := no # Select workaround for AT speculative behaviour. ERRATA_SPECULATIVE_AT := 0 + +# Trap RAS error record access from lower EL +RAS_TRAP_LOWER_EL_ERR_ACCESS := 0 From fba5cdc69569a5b62cbd4303b91bb2d41d335566 Mon Sep 17 00:00:00 2001 From: David Pu Date: Thu, 16 May 2019 17:20:27 -0700 Subject: [PATCH 4/4] Tegra194: ras: verbose prints for SErrors This patch provides verbose prints for RAS SErrors handled by the firmware, for improved debugging. Change-Id: Iaad8d183054d884f606dc4621da2cc6b2375bcf9 Signed-off-by: David Pu Signed-off-by: Varun Wadekar --- .../tegra/include/t194/tegra194_ras_private.h | 3 + plat/nvidia/tegra/soc/t194/plat_ras.c | 70 +++++++++++++++---- 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/plat/nvidia/tegra/include/t194/tegra194_ras_private.h b/plat/nvidia/tegra/include/t194/tegra194_ras_private.h index c867b9d2b..336461af3 100644 --- a/plat/nvidia/tegra/include/t194/tegra194_ras_private.h +++ b/plat/nvidia/tegra/include/t194/tegra194_ras_private.h @@ -18,6 +18,8 @@ struct ras_error { /* RAS error node-specific auxiliary data */ struct ras_aux_data { + /* name for current RAS node. */ + const char *name; /* point to null-terminated ras_error array to convert error code to msg. 
*/ const struct ras_error *error_records; /* @@ -218,6 +220,7 @@ static inline uint64_t node##_err_ctrl(void) \ #define DEFINE_ONE_RAS_AUX_DATA(node) \ { \ + .name = #node, \ .error_records = node##_uncorr_ras_errors, \ .err_ctrl = &node##_err_ctrl \ }, diff --git a/plat/nvidia/tegra/soc/t194/plat_ras.c b/plat/nvidia/tegra/soc/t194/plat_ras.c index eb896a4af..54c2924c7 100644 --- a/plat/nvidia/tegra/soc/t194/plat_ras.c +++ b/plat/nvidia/tegra/soc/t194/plat_ras.c @@ -42,8 +42,8 @@ static void tegra194_ea_handler(unsigned int ea_reason, uint64_t syndrome, ras_lock(); - ERROR("exception reason=%u syndrome=0x%llx on 0x%lx at EL3.\n", - ea_reason, syndrome, read_mpidr_el1()); + ERROR("MPIDR 0x%lx: exception reason=%u syndrome=0x%llx\n", + read_mpidr(), ea_reason, syndrome); /* Call RAS EA handler */ ret = ras_ea_handler(ea_reason, syndrome, cookie, handle, flags); @@ -198,47 +198,90 @@ static int32_t tegra194_ras_record_probe(const struct err_record_info *info, } /* Function to handle error from one given node */ -static int32_t tegra194_ras_node_handler(uint32_t errselr, +static int32_t tegra194_ras_node_handler(uint32_t errselr, const char *name, const struct ras_error *errors, uint64_t status) { bool found = false; uint32_t ierr = (uint32_t)ERR_STATUS_GET_FIELD(status, IERR); uint32_t serr = (uint32_t)ERR_STATUS_GET_FIELD(status, SERR); + uint64_t val = 0; /* not a valid error. */ if (ERR_STATUS_GET_FIELD(status, V) == 0U) { return 0; } + ERR_STATUS_SET_FIELD(val, V, 1); + + /* keep the log print same as linux arm64_ras driver. */ + ERROR("**************************************\n"); + ERROR("RAS Error in %s, ERRSELR_EL1=0x%x:\n", name, errselr); + ERROR("\tStatus = 0x%llx\n", status); + /* Print uncorrectable errror information. 
*/ if (ERR_STATUS_GET_FIELD(status, UE) != 0U) { + ERR_STATUS_SET_FIELD(val, UE, 1); + ERR_STATUS_SET_FIELD(val, UET, 1); + /* IERR to error message */ for (uint32_t i = 0; errors[i].error_msg != NULL; i++) { if (ierr == errors[i].error_code) { - ERROR("ERRSELR_EL1:0x%x\n, IERR = %s(0x%x)\n", - errselr, errors[i].error_msg, - errors[i].error_code); + ERROR("\tIERR = %s: 0x%x\n", + errors[i].error_msg, ierr); + found = true; break; } } if (!found) { - ERROR("unknown uncorrectable eror, " - "ERRSELR_EL1:0x%x, IERR: 0x%x\n", errselr, ierr); + ERROR("\tUnknown IERR: 0x%x\n", ierr); + } + + ERROR("SERR = %s: 0x%x\n", ras_serr_to_str(serr), serr); + + /* Overflow, multiple errors have been detected. */ + if (ERR_STATUS_GET_FIELD(status, OF) != 0U) { + ERROR("\tOverflow (there may be more errors) - " + "Uncorrectable\n"); + ERR_STATUS_SET_FIELD(val, OF, 1); + } + + ERROR("\tUncorrectable (this is fatal)\n"); + + /* Miscellaneous Register Valid. */ + if (ERR_STATUS_GET_FIELD(status, MV) != 0U) { + ERROR("\tMISC0 = 0x%lx\n", read_erxmisc0_el1()); + ERROR("\tMISC1 = 0x%lx\n", read_erxmisc1_el1()); + ERR_STATUS_SET_FIELD(val, MV, 1); + } + + /* Address Valid. */ + if (ERR_STATUS_GET_FIELD(status, AV) != 0U) { + ERROR("\tADDR = 0x%lx\n", read_erxaddr_el1()); + ERR_STATUS_SET_FIELD(val, AV, 1); + } + + /* Deferred error */ + if (ERR_STATUS_GET_FIELD(status, DE) != 0U) { + ERROR("\tDeferred error\n"); + ERR_STATUS_SET_FIELD(val, DE, 1); } - ERROR("SERR = %s(0x%x)\n", ras_serr_to_str(serr), serr); } else { /* For corrected error, simply clear it. */ VERBOSE("corrected RAS error is cleared: ERRSELR_EL1:0x%x, " "IERR:0x%x, SERR:0x%x\n", errselr, ierr, serr); + ERR_STATUS_SET_FIELD(val, CE, 1); } - /* Write to clear reported errors. */ - write_erxstatus_el1(status); + ERROR("**************************************\n"); + /* Write to clear reported errors. 
*/ + write_erxstatus_el1(val); + + /* error handled */ return 0; } @@ -251,6 +294,7 @@ static int32_t tegra194_ras_record_handler(const struct err_record_info *info, const struct ras_aux_data *aux_data = info->aux_data; const struct ras_error *errors; uint32_t offset; + const char *node_name; uint64_t status = 0ULL; @@ -261,6 +305,7 @@ static int32_t tegra194_ras_record_handler(const struct err_record_info *info, offset = (uint32_t)probe_data; errors = aux_data[offset].error_records; + node_name = aux_data[offset].name; assert(errors != NULL); @@ -270,7 +315,8 @@ static int32_t tegra194_ras_record_handler(const struct err_record_info *info, /* Retrieve status register from the error record */ status = read_erxstatus_el1(); - return tegra194_ras_node_handler(idx_start + offset, errors, status); + return tegra194_ras_node_handler(idx_start + offset, node_name, + errors, status); }