diff --git a/bl1/aarch64/bl1_entrypoint.S b/bl1/aarch64/bl1_entrypoint.S
index 50cfae65c..ac6d91331 100644
--- a/bl1/aarch64/bl1_entrypoint.S
+++ b/bl1/aarch64/bl1_entrypoint.S
@@ -130,14 +130,16 @@ func bl1_entrypoint
 	ldr	x2, =__DATA_SIZE__
 	bl	memcpy16
 
-	/* ---------------------------------------------
-	 * Give ourselves a small coherent stack to
-	 * ease the pain of initializing the MMU and
-	 * CCI in assembler
-	 * ---------------------------------------------
+	/* --------------------------------------------
+	 * Allocate a stack whose memory will be marked
+	 * as Normal-IS-WBWA when the MMU is enabled.
+	 * There is no risk of reading stale stack
+	 * memory after enabling the MMU as only the
+	 * primary cpu is running at the moment.
+	 * --------------------------------------------
 	 */
 	mrs	x0, mpidr_el1
-	bl	platform_set_coherent_stack
+	bl	platform_set_stack
 
 	/* ---------------------------------------------
 	 * Architectural init. can be generic e.g.
@@ -150,14 +152,6 @@ func bl1_entrypoint
 	bl	bl1_early_platform_setup
 	bl	bl1_plat_arch_setup
 
-	/* ---------------------------------------------
-	 * Give ourselves a stack allocated in Normal
-	 * -IS-WBWA memory
-	 * ---------------------------------------------
-	 */
-	mrs	x0, mpidr_el1
-	bl	platform_set_stack
-
 	/* --------------------------------------------------
 	 * Initialize platform and jump to our c-entry point
 	 * for this type of reset. Panic if it returns
diff --git a/bl2/aarch64/bl2_entrypoint.S b/bl2/aarch64/bl2_entrypoint.S
index 09eadff2e..c615baf60 100644
--- a/bl2/aarch64/bl2_entrypoint.S
+++ b/bl2/aarch64/bl2_entrypoint.S
@@ -96,12 +96,15 @@ func bl2_entrypoint
 	bl	zeromem16
 
 	/* --------------------------------------------
-	 * Give ourselves a small coherent stack to
-	 * ease the pain of initializing the MMU
+	 * Allocate a stack whose memory will be marked
+	 * as Normal-IS-WBWA when the MMU is enabled.
+	 * There is no risk of reading stale stack
+	 * memory after enabling the MMU as only the
+	 * primary cpu is running at the moment.
 	 * --------------------------------------------
 	 */
 	mrs	x0, mpidr_el1
-	bl	platform_set_coherent_stack
+	bl	platform_set_stack
 
 	/* ---------------------------------------------
 	 * Perform early platform setup & platform
@@ -112,14 +115,6 @@ func bl2_entrypoint
 	bl	bl2_early_platform_setup
 	bl	bl2_plat_arch_setup
 
-	/* ---------------------------------------------
-	 * Give ourselves a stack allocated in Normal
-	 * -IS-WBWA memory
-	 * ---------------------------------------------
-	 */
-	mrs	x0, mpidr_el1
-	bl	platform_set_stack
-
 	/* ---------------------------------------------
 	 * Jump to main function.
 	 * ---------------------------------------------
diff --git a/bl31/aarch64/bl31_entrypoint.S b/bl31/aarch64/bl31_entrypoint.S
index 6e48e3138..102398377 100644
--- a/bl31/aarch64/bl31_entrypoint.S
+++ b/bl31/aarch64/bl31_entrypoint.S
@@ -152,12 +152,15 @@ func bl31_entrypoint
 	msr	spsel, #0
 
 	/* --------------------------------------------
-	 * Give ourselves a small coherent stack to
-	 * ease the pain of initializing the MMU
+	 * Allocate a stack whose memory will be marked
+	 * as Normal-IS-WBWA when the MMU is enabled.
+	 * There is no risk of reading stale stack
+	 * memory after enabling the MMU as only the
+	 * primary cpu is running at the moment.
 	 * --------------------------------------------
 	 */
 	mrs	x0, mpidr_el1
-	bl	platform_set_coherent_stack
+	bl	platform_set_stack
 
 	/* ---------------------------------------------
 	 * Perform platform specific early arch. setup
@@ -174,14 +177,6 @@ func bl31_entrypoint
 	bl	bl31_early_platform_setup
 	bl	bl31_plat_arch_setup
 
-	/* ---------------------------------------------
-	 * Give ourselves a stack allocated in Normal
-	 * -IS-WBWA memory
-	 * ---------------------------------------------
-	 */
-	mrs	x0, mpidr_el1
-	bl	platform_set_stack
-
 	/* ---------------------------------------------
 	 * Jump to main function.
 	 * ---------------------------------------------
diff --git a/bl31/aarch64/context.S b/bl31/aarch64/context.S
index 269821574..79b5d19cd 100644
--- a/bl31/aarch64/context.S
+++ b/bl31/aarch64/context.S
@@ -43,23 +43,9 @@
 	.global el3_sysregs_context_save
 func el3_sysregs_context_save
 
-	mrs	x10, sctlr_el3
-	str	x10, [x0, #CTX_SCTLR_EL3]
-
-	mrs	x11, cptr_el3
-	stp	x11, xzr, [x0, #CTX_CPTR_EL3]
-
-	mrs	x13, cntfrq_el0
-	mrs	x14, mair_el3
-	stp	x13, x14, [x0, #CTX_CNTFRQ_EL0]
-
-	mrs	x15, tcr_el3
-	mrs	x16, ttbr0_el3
-	stp	x15, x16, [x0, #CTX_TCR_EL3]
-
-	mrs	x17, daif
-	and	x17, x17, #(DAIF_ABT_BIT | DAIF_DBG_BIT)
-	stp	x17, xzr, [x0, #CTX_DAIF_EL3]
+	mrs	x10, cptr_el3
+	mrs	x11, cntfrq_el0
+	stp	x10, x11, [x0, #CTX_CPTR_EL3]
 
 	ret
 
@@ -78,27 +64,9 @@ func el3_sysregs_context_save
 	.global el3_sysregs_context_restore
 func el3_sysregs_context_restore
 
-	ldp	x11, xzr, [x0, #CTX_CPTR_EL3]
-	msr	cptr_el3, x11
-
-	ldp	x13, x14, [x0, #CTX_CNTFRQ_EL0]
-	msr	cntfrq_el0, x13
-	msr	mair_el3, x14
-
-	ldp	x15, x16, [x0, #CTX_TCR_EL3]
-	msr	tcr_el3, x15
-	msr	ttbr0_el3, x16
-
-	ldp	x17, xzr, [x0, #CTX_DAIF_EL3]
-	mrs	x11, daif
-	orr	x17, x17, x11
-	msr	daif, x17
-
-	/* Make sure all the above changes are observed */
-	isb
-
-	ldr	x10, [x0, #CTX_SCTLR_EL3]
-	msr	sctlr_el3, x10
+	ldp	x13, x14, [x0, #CTX_CPTR_EL3]
+	msr	cptr_el3, x13
+	msr	cntfrq_el0, x14
 	isb
 
 	ret
diff --git a/bl31/aarch64/runtime_exceptions.S b/bl31/aarch64/runtime_exceptions.S
index a11cd71f1..0ab86ab6b 100644
--- a/bl31/aarch64/runtime_exceptions.S
+++ b/bl31/aarch64/runtime_exceptions.S
@@ -403,7 +403,7 @@ smc_handler64:
 	mrs	x17, elr_el3
 	mrs	x18, scr_el3
 	stp	x16, x17, [x6, #CTX_EL3STATE_OFFSET + CTX_SPSR_EL3]
-	stp	x18, xzr, [x6, #CTX_EL3STATE_OFFSET + CTX_SCR_EL3]
+	str	x18, [x6, #CTX_EL3STATE_OFFSET + CTX_SCR_EL3]
 
 	/* Copy SCR_EL3.NS bit to the flag to indicate caller's security */
 	bfi	x7, x18, #0, #1
@@ -446,7 +446,7 @@ el3_exit: ; .type el3_exit, %function
 	 * Restore SPSR_EL3, ELR_EL3 and SCR_EL3 prior to ERET
 	 * -----------------------------------------------------
 	 */
-	ldp	x18, xzr, [sp, #CTX_EL3STATE_OFFSET + CTX_SCR_EL3]
+	ldr	x18, [sp, #CTX_EL3STATE_OFFSET + CTX_SCR_EL3]
 	ldp	x16, x17, [sp, #CTX_EL3STATE_OFFSET + CTX_SPSR_EL3]
 	msr	scr_el3, x18
 	msr	spsr_el3, x16
diff --git a/bl31/bl31.mk b/bl31/bl31.mk
index 5555c319c..fb17a2e60 100644
--- a/bl31/bl31.mk
+++ b/bl31/bl31.mk
@@ -48,6 +48,7 @@ BL31_SOURCES		+=	bl31/bl31_main.c				\
 				services/std_svc/psci/psci_afflvl_suspend.c	\
 				services/std_svc/psci/psci_common.c		\
 				services/std_svc/psci/psci_entry.S		\
+				services/std_svc/psci/psci_helpers.S		\
 				services/std_svc/psci/psci_main.c		\
 				services/std_svc/psci/psci_setup.c
 
diff --git a/bl32/tsp/aarch64/tsp_entrypoint.S b/bl32/tsp/aarch64/tsp_entrypoint.S
index 479ca59b3..7a1797eef 100644
--- a/bl32/tsp/aarch64/tsp_entrypoint.S
+++ b/bl32/tsp/aarch64/tsp_entrypoint.S
@@ -31,6 +31,7 @@
 #include <arch.h>
 #include <asm_macros.S>
 #include <tsp.h>
+#include <xlat_tables.h>
 
 
 	.globl	tsp_entrypoint
@@ -111,12 +112,15 @@ func tsp_entrypoint
 	bl	zeromem16
 
 	/* --------------------------------------------
-	 * Give ourselves a small coherent stack to
-	 * ease the pain of initializing the MMU
+	 * Allocate a stack whose memory will be marked
+	 * as Normal-IS-WBWA when the MMU is enabled.
+	 * There is no risk of reading stale stack
+	 * memory after enabling the MMU as only the
+	 * primary cpu is running at the moment.
 	 * --------------------------------------------
 	 */
 	mrs	x0, mpidr_el1
-	bl	platform_set_coherent_stack
+	bl	platform_set_stack
 
 	/* ---------------------------------------------
 	 * Perform early platform setup & platform
@@ -126,14 +130,6 @@ func tsp_entrypoint
 	bl	bl32_early_platform_setup
 	bl	bl32_plat_arch_setup
 
-	/* ---------------------------------------------
-	 * Give ourselves a stack allocated in Normal
-	 * -IS-WBWA memory
-	 * ---------------------------------------------
-	 */
-	mrs	x0, mpidr_el1
-	bl	platform_set_stack
-
 	/* ---------------------------------------------
 	 * Jump to main function.
 	 * ---------------------------------------------
@@ -209,26 +205,48 @@ func tsp_cpu_on_entry
 	isb
 
 	/* --------------------------------------------
-	 * Give ourselves a small coherent stack to
-	 * ease the pain of initializing the MMU
+	 * Give ourselves a stack whose memory will be
+	 * marked as Normal-IS-WBWA when the MMU is
+	 * enabled.
 	 * --------------------------------------------
 	 */
 	mrs	x0, mpidr_el1
-	bl	platform_set_coherent_stack
+	bl	platform_set_stack
 
-	/* ---------------------------------------------
-	 * Initialise the MMU
-	 * ---------------------------------------------
+	/* --------------------------------------------
+	 * Enable the MMU with the DCache disabled. It
+	 * is safe to use stacks allocated in normal
+	 * memory as a result. All memory accesses are
+	 * marked nGnRnE when the MMU is disabled. So
+	 * all the stack writes will make it to memory.
+	 * All memory accesses are marked Non-cacheable
+	 * when the MMU is enabled but D$ is disabled.
+	 * So used stack memory is guaranteed to be
+	 * visible immediately after the MMU is enabled
+	 * Enabling the DCache at the same time as the
+	 * MMU can lead to speculatively fetched and
+	 * possibly stale stack memory being read from
+	 * other caches. This can lead to coherency
+	 * issues.
+	 * --------------------------------------------
 	 */
+	mov	x0, #DISABLE_DCACHE
 	bl	bl32_plat_enable_mmu
 
 	/* ---------------------------------------------
-	 * Give ourselves a stack allocated in Normal
-	 * -IS-WBWA memory
+	 * Enable the Data cache now that the MMU has
+	 * been enabled. The stack has been unwound. It
+	 * will be written first before being read. This
+	 * will invalidate any stale cache lines
+	 * resident in other caches. We assume that
+	 * interconnect coherency has been enabled for
+	 * this cluster by EL3 firmware.
 	 * ---------------------------------------------
 	 */
-	mrs	x0, mpidr_el1
-	bl	platform_set_stack
+	mrs	x0, sctlr_el1
+	orr	x0, x0, #SCTLR_C_BIT
+	msr	sctlr_el1, x0
+	isb
 
 	/* ---------------------------------------------
 	 * Enter C runtime to perform any remaining
diff --git a/docs/porting-guide.md b/docs/porting-guide.md
index 813d0be6c..2bd1b5611 100644
--- a/docs/porting-guide.md
+++ b/docs/porting-guide.md
@@ -104,12 +104,6 @@ file is found in [plat/fvp/include/platform_def.h].
     by [plat/common/aarch64/platform_mp_stack.S] and
     [plat/common/aarch64/platform_up_stack.S].
 
-*   **#define : PCPU_DV_MEM_STACK_SIZE**
-
-    Defines the coherent stack memory available to each CPU. This constant is used
-    by [plat/common/aarch64/platform_mp_stack.S] and
-    [plat/common/aarch64/platform_up_stack.S].
-
 *   **#define : FIRMWARE_WELCOME_STR**
 
     Defines the character string printed by BL1 upon entry into the `bl1_main()`
@@ -395,31 +389,6 @@ maximum of 4 CPUs:
     cluster_id = 8-bit value in MPIDR at affinity level 1
 
 
-### Function : platform_set_coherent_stack()
-
-    Argument : unsigned long
-    Return   : void
-
-A platform may need stack memory that is coherent with main memory to perform
-certain operations like:
-
-*   Turning the MMU on, or
-*   Flushing caches prior to powering down a CPU or cluster.
-
-Each BL stage allocates this coherent stack memory for each CPU in the
-`tzfw_coherent_mem` section.
-
-This function sets the current stack pointer to the coherent stack that
-has been allocated for the CPU specified by MPIDR. For BL images that only
-require a stack for the primary CPU the parameter is ignored. The size of
-the stack allocated to each CPU is specified by the platform defined constant
-`PCPU_DV_MEM_STACK_SIZE`.
-
-Common implementations of this function for the UP and MP BL images are
-provided in [plat/common/aarch64/platform_up_stack.S] and
-[plat/common/aarch64/platform_mp_stack.S]
-
-
 ### Function : platform_is_primary_cpu()
 
     Argument : unsigned long
@@ -1116,11 +1085,6 @@ the calling CPU is the last powered on CPU in the cluster, after powering down
 affinity level 0 (CPU), the platform port should power down affinity level 1
 (the cluster) as well.
 
-This function is called with coherent stacks. This allows the PSCI
-implementation to flush caches at a given affinity level without running into
-stale stack state after turning off the caches. On ARMv8-A cache hits do not
-occur after the cache has been turned off.
-
 #### plat_pm_ops.affinst_suspend()
 
 Perform the platform specific setup to power off an affinity instance in the
@@ -1143,11 +1107,6 @@ case, the affinity instance is expected to save enough state so that it can
 resume execution by restoring this state when its powered on (see
 `affinst_suspend_finish()`).
 
-This function is called with coherent stacks. This allows the PSCI
-implementation to flush caches at a given affinity level without running into
-stale stack state after turning off the caches. On ARMv8-A cache hits do not
-occur after the cache has been turned off.
-
 #### plat_pm_ops.affinst_on_finish()
 
 This function is called by the PSCI implementation after the calling CPU is
@@ -1159,11 +1118,6 @@ services.
 The `MPIDR` (first argument), `affinity level` (second argument) and `state`
 (third argument) have a similar meaning as described in the previous operations.
 
-This function is called with coherent stacks. This allows the PSCI
-implementation to flush caches at a given affinity level without running into
-stale stack state after turning off the caches. On ARMv8-A cache hits do not
-occur after the cache has been turned off.
-
 #### plat_pm_ops.affinst_on_suspend()
 
 This function is called by the PSCI implementation after the calling CPU is
@@ -1176,11 +1130,6 @@ and also provide secure runtime firmware services.
 The `MPIDR` (first argument), `affinity level` (second argument) and `state`
 (third argument) have a similar meaning as described in the previous operations.
 
-This function is called with coherent stacks. This allows the PSCI
-implementation to flush caches at a given affinity level without running into
-stale stack state after turning off the caches. On ARMv8-A cache hits do not
-occur after the cache has been turned off.
-
 BL3-1 platform initialization code must also detect the system topology and
 the state of each affinity instance in the topology. This information is
 critical for the PSCI runtime service to function correctly. More details are
diff --git a/include/bl31/context.h b/include/bl31/context.h
index 82d0c9ce7..3bf498065 100644
--- a/include/bl31/context.h
+++ b/include/bl31/context.h
@@ -76,21 +76,13 @@
  * 32-bits wide but are stored as 64-bit values for convenience
  ******************************************************************************/
 #define CTX_EL3STATE_OFFSET	(CTX_GPREGS_OFFSET + CTX_GPREGS_END)
-#define CTX_VBAR_EL3	0x0		/* Currently unused */
+#define CTX_SCR_EL3		0x0
 #define CTX_RUNTIME_SP		0x8
 #define CTX_SPSR_EL3		0x10
 #define CTX_ELR_EL3		0x18
-#define CTX_SCR_EL3		0x20
-#define CTX_SCTLR_EL3		0x28
-#define CTX_CPTR_EL3		0x30
-/* Unused space to allow registers to be stored as pairs */
-#define CTX_CNTFRQ_EL0		0x40
-#define CTX_MAIR_EL3		0x48
-#define CTX_TCR_EL3		0x50
-#define CTX_TTBR0_EL3		0x58
-#define CTX_DAIF_EL3		0x60
-/* Unused space to honour alignment requirements */
-#define CTX_EL3STATE_END	0x70
+#define CTX_CPTR_EL3		0x20
+#define CTX_CNTFRQ_EL0		0x28
+#define CTX_EL3STATE_END	0x30
 
 /*******************************************************************************
  * Constants that allow assembler code to access members of and the
diff --git a/include/lib/aarch64/xlat_tables.h b/include/lib/aarch64/xlat_tables.h
index 8e0adc7fa..2d4a211b9 100644
--- a/include/lib/aarch64/xlat_tables.h
+++ b/include/lib/aarch64/xlat_tables.h
@@ -31,6 +31,14 @@
 #ifndef __XLAT_TABLES_H__
 #define __XLAT_TABLES_H__
 
+
+/*
+ * Flags to override default values used to program system registers while
+ * enabling the MMU.
+ */
+#define DISABLE_DCACHE		(1 << 0)
+
+#ifndef __ASSEMBLY__
 #include <stdint.h>
 
 /*
@@ -67,7 +75,8 @@ void mmap_add(const mmap_region_t *mm);
 
 void init_xlat_tables(void);
 
-void enable_mmu_el1(void);
-void enable_mmu_el3(void);
+void enable_mmu_el1(uint32_t flags);
+void enable_mmu_el3(uint32_t flags);
 
+#endif /*__ASSEMBLY__*/
 #endif /* __XLAT_TABLES_H__ */
diff --git a/include/plat/common/platform.h b/include/plat/common/platform.h
index 1eeaac278..4b73a0971 100644
--- a/include/plat/common/platform.h
+++ b/include/plat/common/platform.h
@@ -180,7 +180,7 @@ unsigned int plat_get_aff_state(unsigned int, unsigned long);
 /*******************************************************************************
  * Optional BL3-1 functions (may be overridden)
  ******************************************************************************/
-void bl31_plat_enable_mmu(void);
+void bl31_plat_enable_mmu(uint32_t flags);
 
 /*******************************************************************************
  * Mandatory BL3-2 functions (only if platform contains a BL3-2)
@@ -190,6 +190,6 @@ void bl32_platform_setup(void);
 /*******************************************************************************
  * Optional BL3-2 functions (may be overridden)
  ******************************************************************************/
-void bl32_plat_enable_mmu(void);
+void bl32_plat_enable_mmu(uint32_t flags);
 
 #endif /* __PLATFORM_H__ */
diff --git a/lib/aarch64/xlat_tables.c b/lib/aarch64/xlat_tables.c
index f1d658d52..d49411297 100644
--- a/lib/aarch64/xlat_tables.c
+++ b/lib/aarch64/xlat_tables.c
@@ -292,7 +292,7 @@ void init_xlat_tables(void)
  *			exception level
  ******************************************************************************/
 #define DEFINE_ENABLE_MMU_EL(_el, _tcr_extra, _tlbi_fct)		\
-	void enable_mmu_el##_el(void)					\
+	void enable_mmu_el##_el(uint32_t flags)				\
 	{								\
 		uint64_t mair, tcr, ttbr;				\
 		uint32_t sctlr;						\
@@ -330,7 +330,13 @@ void init_xlat_tables(void)
 									\
 		sctlr = read_sctlr_el##_el();				\
 		sctlr |= SCTLR_WXN_BIT | SCTLR_M_BIT | SCTLR_I_BIT;	\
-		sctlr |= SCTLR_A_BIT | SCTLR_C_BIT;			\
+		sctlr |= SCTLR_A_BIT;					\
+									\
+		if (flags & DISABLE_DCACHE)				\
+			sctlr &= ~SCTLR_C_BIT;				\
+		else							\
+			sctlr |= SCTLR_C_BIT;				\
+									\
 		write_sctlr_el##_el(sctlr);				\
 									\
 		/* Ensure the MMU enable takes effect immediately */	\
diff --git a/plat/common/aarch64/plat_common.c b/plat/common/aarch64/plat_common.c
index 94b9dfdf0..90574fd66 100644
--- a/plat/common/aarch64/plat_common.c
+++ b/plat/common/aarch64/plat_common.c
@@ -38,12 +38,12 @@
 #pragma weak bl31_plat_enable_mmu
 #pragma weak bl32_plat_enable_mmu
 
-void bl31_plat_enable_mmu(void)
+void bl31_plat_enable_mmu(uint32_t flags)
 {
-	enable_mmu_el3();
+	enable_mmu_el3(flags);
 }
 
-void bl32_plat_enable_mmu(void)
+void bl32_plat_enable_mmu(uint32_t flags)
 {
-	enable_mmu_el1();
+	enable_mmu_el1(flags);
 }
diff --git a/plat/common/aarch64/platform_mp_stack.S b/plat/common/aarch64/platform_mp_stack.S
index 801ec7fab..8eb1aa689 100644
--- a/plat/common/aarch64/platform_mp_stack.S
+++ b/plat/common/aarch64/platform_mp_stack.S
@@ -33,28 +33,11 @@
 #include <platform_def.h>
 
 
-	.local	pcpu_dv_mem_stack
 	.local	platform_normal_stacks
 	.weak	platform_set_stack
 	.weak	platform_get_stack
-	.weak	platform_set_coherent_stack
 
 
-	/* -----------------------------------------------------
-	 * void platform_set_coherent_stack (unsigned long mpidr)
-	 *
-	 * For a given CPU, this function sets the stack pointer
-	 * to a stack allocated in device memory. This stack can
-	 * be used by C code which enables/disables the SCTLR.M
-	 * SCTLR.C bit e.g. while powering down a cpu
-	 * -----------------------------------------------------
-	 */
-func platform_set_coherent_stack
-	mov x5, x30 // lr
-	get_mp_stack pcpu_dv_mem_stack, PCPU_DV_MEM_STACK_SIZE
-	mov sp, x0
-	ret x5
-
 	/* -----------------------------------------------------
 	 * unsigned long platform_get_stack (unsigned long mpidr)
 	 *
@@ -81,22 +64,9 @@ func platform_set_stack
 	ret x9
 
 	/* -----------------------------------------------------
-	 * Per-cpu stacks in normal memory.
-	 * Used for C code during runtime execution (when coherent
-	 * stacks are not required).
-	 * Each cpu gets a stack of PLATFORM_STACK_SIZE bytes.
+	 * Per-cpu stacks in normal memory. Each cpu gets a
+	 * stack of PLATFORM_STACK_SIZE bytes.
 	 * -----------------------------------------------------
 	 */
 declare_stack platform_normal_stacks, tzfw_normal_stacks, \
 		PLATFORM_STACK_SIZE, PLATFORM_CORE_COUNT
-
-	/* -----------------------------------------------------
-	 * Per-cpu stacks in device memory.
-	 * Used for C code just before power down or right after
-	 * power up when the MMU or caches need to be turned on
-	 * or off.
-	 * Each cpu gets a stack of PCPU_DV_MEM_STACK_SIZE bytes.
-	 * -----------------------------------------------------
-	 */
-declare_stack pcpu_dv_mem_stack, tzfw_coherent_mem, \
-		PCPU_DV_MEM_STACK_SIZE, PLATFORM_CORE_COUNT
diff --git a/plat/common/aarch64/platform_up_stack.S b/plat/common/aarch64/platform_up_stack.S
index 45a96a646..73b74b2ee 100644
--- a/plat/common/aarch64/platform_up_stack.S
+++ b/plat/common/aarch64/platform_up_stack.S
@@ -33,26 +33,9 @@
 #include <platform_def.h>
 
 
-	.local	pcpu_dv_mem_stack
 	.local	platform_normal_stacks
 	.globl	platform_set_stack
 	.globl	platform_get_stack
-	.globl	platform_set_coherent_stack
-
-
-	/* -----------------------------------------------------
-	 * void platform_set_coherent_stack (unsigned long)
-	 *
-	 * For cold-boot BL images, only the primary CPU needs a
-	 * stack. This function sets the stack pointer to a stack
-	 * allocated in device memory.
-	 * -----------------------------------------------------
-	 */
-func platform_set_coherent_stack
-	get_up_stack pcpu_dv_mem_stack, PCPU_DV_MEM_STACK_SIZE
-	mov sp, x0
-	ret
-
 
 	/* -----------------------------------------------------
 	 * unsigned long platform_get_stack (unsigned long)
@@ -87,11 +70,3 @@ func platform_set_stack
 	 */
 declare_stack platform_normal_stacks, tzfw_normal_stacks, \
 		PLATFORM_STACK_SIZE, 1
-
-	/* -----------------------------------------------------
-	 * Single cpu stack in device/coherent memory.
-	 * PCPU_DV_MEM_STACK_SIZE bytes are allocated.
-	 * -----------------------------------------------------
-	 */
-declare_stack pcpu_dv_mem_stack, tzfw_coherent_mem, \
-		PCPU_DV_MEM_STACK_SIZE, 1
diff --git a/plat/fvp/aarch64/fvp_common.c b/plat/fvp/aarch64/fvp_common.c
index 3fe3a218f..392623971 100644
--- a/plat/fvp/aarch64/fvp_common.c
+++ b/plat/fvp/aarch64/fvp_common.c
@@ -119,7 +119,7 @@ const unsigned int num_sec_irqs = sizeof(irq_sec_array) /
 		mmap_add(fvp_mmap);					\
 		init_xlat_tables();					\
 									\
-		enable_mmu_el##_el();					\
+		enable_mmu_el##_el(0);					\
 	}
 
 /* Define EL1 and EL3 variants of the function initialising the MMU */
diff --git a/plat/fvp/fvp_pm.c b/plat/fvp/fvp_pm.c
index 55f465ba1..22e53e123 100644
--- a/plat/fvp/fvp_pm.c
+++ b/plat/fvp/fvp_pm.c
@@ -120,11 +120,10 @@ exit:
  * platform to decide whether the cluster is being turned off and take apt
  * actions.
  *
- * CAUTION: This function is called with coherent stacks so that caches can be
- * turned off, flushed and coherency disabled. There is no guarantee that caches
- * will remain turned on across calls to this function as each affinity level is
- * dealt with. So do not write & read global variables across calls. It will be
- * wise to do flush a write to the global to prevent unpredictable results.
+ * CAUTION: There is no guarantee that caches will remain turned on across calls
+ * to this function as each affinity level is dealt with. So do not write & read
+ * global variables across calls. It is wise to flush any write to a global
+ * variable to prevent unpredictable results.
  ******************************************************************************/
 int fvp_affinst_off(unsigned long mpidr,
 		    unsigned int afflvl,
@@ -192,11 +191,10 @@ int fvp_affinst_off(unsigned long mpidr,
  * platform to decide whether the cluster is being turned off and take apt
  * actions.
  *
- * CAUTION: This function is called with coherent stacks so that caches can be
- * turned off, flushed and coherency disabled. There is no guarantee that caches
- * will remain turned on across calls to this function as each affinity level is
- * dealt with. So do not write & read global variables across calls. It will be
- * wise to do flush a write to the global to prevent unpredictable results.
+ * CAUTION: There is no guarantee that caches will remain turned on across calls
+ * to this function as each affinity level is dealt with. So do not write & read
+ * global variables across calls. It is wise to flush any write to a global
+ * variable to prevent unpredictable results.
  ******************************************************************************/
 int fvp_affinst_suspend(unsigned long mpidr,
 			unsigned long sec_entrypoint,
diff --git a/plat/fvp/include/platform_def.h b/plat/fvp/include/platform_def.h
index ec4cf525e..998326630 100644
--- a/plat/fvp/include/platform_def.h
+++ b/plat/fvp/include/platform_def.h
@@ -47,13 +47,6 @@
 /* Size of cacheable stacks */
 #define PLATFORM_STACK_SIZE	0x800
 
-/* Size of coherent stacks for debug and release builds */
-#if DEBUG
-#define PCPU_DV_MEM_STACK_SIZE	0x400
-#else
-#define PCPU_DV_MEM_STACK_SIZE	0x300
-#endif
-
 #define FIRMWARE_WELCOME_STR		"Booting trusted firmware boot loader stage 1\n\r"
 
 /* Trusted Boot Firmware BL2 */
diff --git a/services/std_svc/psci/psci_afflvl_off.c b/services/std_svc/psci/psci_afflvl_off.c
index a8904e984..83d19d3e3 100644
--- a/services/std_svc/psci/psci_afflvl_off.c
+++ b/services/std_svc/psci/psci_afflvl_off.c
@@ -44,7 +44,6 @@ static int psci_afflvl0_off(aff_map_node_t *cpu_node)
 {
 	unsigned int plat_state;
 	int rc;
-	unsigned long sctlr;
 
 	assert(cpu_node->level == MPIDR_AFFLVL0);
 
@@ -70,24 +69,8 @@ static int psci_afflvl0_off(aff_map_node_t *cpu_node)
 	/*
 	 * Arch. management. Perform the necessary steps to flush all
 	 * cpu caches.
-	 *
-	 * TODO: This power down sequence varies across cpus so it needs to be
-	 * abstracted out on the basis of the MIDR like in cpu_reset_handler().
-	 * Do the bare minimal for the time being. Fix this before porting to
-	 * Cortex models.
 	 */
-	sctlr = read_sctlr_el3();
-	sctlr &= ~SCTLR_C_BIT;
-	write_sctlr_el3(sctlr);
-	isb();	/* ensure MMU disable takes immediate effect */
-
-	/*
-	 * CAUTION: This flush to the level of unification makes an assumption
-	 * about the cache hierarchy at affinity level 0 (cpu) in the platform.
-	 * Ideally the platform should tell psci which levels to flush to exit
-	 * coherency.
-	 */
-	dcsw_op_louis(DCCISW);
+	psci_do_pwrdown_cache_maintenance(MPIDR_AFFLVL0);
 
 	/*
 	 * Plat. management: Perform platform specific actions to turn this
@@ -227,9 +210,6 @@ static int psci_call_off_handlers(mpidr_aff_map_nodes_t mpidr_nodes,
  * the lowest to the highest affinity level implemented by the platform because
  * to turn off affinity level X it is neccesary to turn off affinity level X - 1
  * first.
- *
- * CAUTION: This function is called with coherent stacks so that coherency can
- * be turned off and caches can be flushed safely.
  ******************************************************************************/
 int psci_afflvl_off(int start_afflvl,
 		    int end_afflvl)
diff --git a/services/std_svc/psci/psci_afflvl_on.c b/services/std_svc/psci/psci_afflvl_on.c
index d62017245..3b7d80545 100644
--- a/services/std_svc/psci/psci_afflvl_on.c
+++ b/services/std_svc/psci/psci_afflvl_on.c
@@ -359,9 +359,9 @@ static unsigned int psci_afflvl0_on_finish(aff_map_node_t *cpu_node)
 	}
 
 	/*
-	 * Arch. management: Turn on mmu & restore architectural state
+	 * Arch. management: Enable data cache and manage stack memory
 	 */
-	bl31_plat_enable_mmu();
+	psci_do_pwrup_cache_maintenance();
 
 	/*
 	 * All the platform specific actions for turning this cpu
diff --git a/services/std_svc/psci/psci_afflvl_suspend.c b/services/std_svc/psci/psci_afflvl_suspend.c
index 097719830..1e60276c0 100644
--- a/services/std_svc/psci/psci_afflvl_suspend.c
+++ b/services/std_svc/psci/psci_afflvl_suspend.c
@@ -126,8 +126,7 @@ static int psci_afflvl0_suspend(aff_map_node_t *cpu_node,
 				unsigned int power_state)
 {
 	unsigned int plat_state;
-	unsigned long psci_entrypoint, sctlr;
-	el3_state_t *saved_el3_state;
+	unsigned long psci_entrypoint;
 	uint32_t ns_scr_el3 = read_scr_el3();
 	uint32_t ns_sctlr_el1 = read_sctlr_el1();
 	int rc;
@@ -170,37 +169,14 @@ static int psci_afflvl0_suspend(aff_map_node_t *cpu_node,
 	 */
 	cm_el3_sysregs_context_save(NON_SECURE);
 
-	/*
-	 * The EL3 state to PoC since it will be accessed after a
-	 * reset with the caches turned off
-	 */
-	saved_el3_state = get_el3state_ctx(cm_get_context(NON_SECURE));
-	flush_dcache_range((uint64_t) saved_el3_state, sizeof(*saved_el3_state));
-
 	/* Set the secure world (EL3) re-entry point after BL1 */
 	psci_entrypoint = (unsigned long) psci_aff_suspend_finish_entry;
 
 	/*
 	 * Arch. management. Perform the necessary steps to flush all
 	 * cpu caches.
-	 *
-	 * TODO: This power down sequence varies across cpus so it needs to be
-	 * abstracted out on the basis of the MIDR like in cpu_reset_handler().
-	 * Do the bare minimal for the time being. Fix this before porting to
-	 * Cortex models.
 	 */
-	sctlr = read_sctlr_el3();
-	sctlr &= ~SCTLR_C_BIT;
-	write_sctlr_el3(sctlr);
-	isb();	/* ensure MMU disable takes immediate effect */
-
-	/*
-	 * CAUTION: This flush to the level of unification makes an assumption
-	 * about the cache hierarchy at affinity level 0 (cpu) in the platform.
-	 * Ideally the platform should tell psci which levels to flush to exit
-	 * coherency.
-	 */
-	dcsw_op_louis(DCCISW);
+	psci_do_pwrdown_cache_maintenance(MPIDR_AFFLVL0);
 
 	/*
 	 * Plat. management: Allow the platform to perform the
@@ -379,9 +355,6 @@ static int psci_call_suspend_handlers(mpidr_aff_map_nodes_t mpidr_nodes,
  * the lowest to the highest affinity level implemented by the platform because
  * to turn off affinity level X it is neccesary to turn off affinity level X - 1
  * first.
- *
- * CAUTION: This function is called with coherent stacks so that coherency can
- * be turned off and caches can be flushed safely.
  ******************************************************************************/
 int psci_afflvl_suspend(unsigned long entrypoint,
 			unsigned long context_id,
@@ -467,9 +440,11 @@ static unsigned int psci_afflvl0_suspend_finish(aff_map_node_t *cpu_node)
 
 	/* Get the index for restoring the re-entry information */
 	/*
-	 * Arch. management: Restore the stashed EL3 architectural
-	 * context from the 'cpu_context' structure for this cpu.
+	 * Arch. management: Enable the data cache, manage stack memory and
+	 * restore the stashed EL3 architectural context from the 'cpu_context'
+	 * structure for this cpu.
 	 */
+	psci_do_pwrup_cache_maintenance();
 	cm_el3_sysregs_context_restore(NON_SECURE);
 
 	/*
@@ -575,4 +550,3 @@ const afflvl_power_on_finisher_t psci_afflvl_suspend_finishers[] = {
 	psci_afflvl1_suspend_finish,
 	psci_afflvl2_suspend_finish,
 };
-
diff --git a/services/std_svc/psci/psci_common.c b/services/std_svc/psci/psci_common.c
index 3c79a5e7a..56f3daf27 100644
--- a/services/std_svc/psci/psci_common.c
+++ b/services/std_svc/psci/psci_common.c
@@ -390,9 +390,6 @@ static int psci_call_power_on_handlers(mpidr_aff_map_nodes_t mpidr_nodes,
  * the highest to the lowest affinity level implemented by the platform because
  * to turn on affinity level X it is neccesary to turn on affinity level X + 1
  * first.
- *
- * CAUTION: This function is called with coherent stacks so that coherency and
- * the mmu can be turned on safely.
  ******************************************************************************/
 void psci_afflvl_power_on_finish(int start_afflvl,
 				 int end_afflvl,
diff --git a/services/std_svc/psci/psci_entry.S b/services/std_svc/psci/psci_entry.S
index 1ffde069f..192b638c1 100644
--- a/services/std_svc/psci/psci_entry.S
+++ b/services/std_svc/psci/psci_entry.S
@@ -31,6 +31,7 @@
 #include <arch.h>
 #include <asm_macros.S>
 #include <psci.h>
+#include <xlat_tables.h>
 
 	.globl	psci_aff_on_finish_entry
 	.globl	psci_aff_suspend_finish_entry
@@ -43,11 +44,6 @@
 	 * upon whether it was resumed from suspend or simply
 	 * turned on, call the common power on finisher with
 	 * the handlers (chosen depending upon original state).
-	 * For ease, the finisher is called with coherent
-	 * stacks. This allows the cluster/cpu finishers to
-	 * enter coherency and enable the mmu without running
-	 * into issues. We switch back to normal stacks once
-	 * all this is done.
 	 * -----------------------------------------------------
 	 */
 func psci_aff_on_finish_entry
@@ -78,8 +74,34 @@ psci_aff_common_finish_entry:
 	 */
 	msr	spsel, #0
 
+	/* --------------------------------------------
+	 * Give ourselves a stack whose memory will be
+	 * marked as Normal-IS-WBWA when the MMU is
+	 * enabled.
+	 * --------------------------------------------
+	 */
 	mrs	x0, mpidr_el1
-	bl	platform_set_coherent_stack
+	bl	platform_set_stack
+
+	/* --------------------------------------------
+	 * Enable the MMU with the DCache disabled. It
+	 * is safe to use stacks allocated in normal
+	 * memory as a result. All memory accesses are
+	 * marked nGnRnE when the MMU is disabled. So
+	 * all the stack writes will make it to memory.
+	 * All memory accesses are marked Non-cacheable
+	 * when the MMU is enabled but D$ is disabled.
+	 * So used stack memory is guaranteed to be
+	 * visible immediately after the MMU is enabled.
+	 * Enabling the DCache at the same time as the
+	 * MMU can lead to speculatively fetched and
+	 * possibly stale stack memory being read from
+	 * other caches. This can lead to coherency
+	 * issues.
+	 * --------------------------------------------
+	 */
+	mov	x0, #DISABLE_DCACHE
+	bl	bl31_plat_enable_mmu
 
 	/* ---------------------------------------------
 	 * Call the finishers starting from affinity
@@ -95,60 +117,10 @@ psci_aff_common_finish_entry:
 	mov	x0, #MPIDR_AFFLVL0
 	bl	psci_afflvl_power_on_finish
 
-	/* --------------------------------------------
-	 * Give ourselves a stack allocated in Normal
-	 * -IS-WBWA memory
-	 * --------------------------------------------
-	 */
-	mrs	x0, mpidr_el1
-	bl	platform_set_stack
-
 	b	el3_exit
 _panic:
 	b	_panic
 
-	/* -----------------------------------------------------
-	 * The following two stubs give the calling cpu a
-	 * coherent stack to allow flushing of caches without
-	 * suffering from stack coherency issues
-	 * -----------------------------------------------------
-	 */
-func __psci_cpu_off
-	func_prologue
-	sub	sp, sp, #0x10
-	stp	x19, x20, [sp, #0]
-	mov	x19, sp
-	mrs	x0, mpidr_el1
-	bl	platform_set_coherent_stack
-	bl	psci_cpu_off
-	mov	sp, x19
-	ldp	x19, x20, [sp,#0]
-	add	sp, sp, #0x10
-	func_epilogue
-	ret
-
-func __psci_cpu_suspend
-	func_prologue
-	sub	sp, sp, #0x20
-	stp	x19, x20, [sp, #0]
-	stp	x21, x22, [sp, #0x10]
-	mov	x19, sp
-	mov	x20, x0
-	mov	x21, x1
-	mov	x22, x2
-	mrs	x0, mpidr_el1
-	bl	platform_set_coherent_stack
-	mov	x0, x20
-	mov	x1, x21
-	mov	x2, x22
-	bl	psci_cpu_suspend
-	mov	sp, x19
-	ldp	x21, x22, [sp,#0x10]
-	ldp	x19, x20, [sp,#0]
-	add	sp, sp, #0x20
-	func_epilogue
-	ret
-
 	/* --------------------------------------------
 	 * This function is called to indicate to the
 	 * power controller that it is safe to power
diff --git a/services/std_svc/psci/psci_helpers.S b/services/std_svc/psci/psci_helpers.S
new file mode 100644
index 000000000..21b5688ce
--- /dev/null
+++ b/services/std_svc/psci/psci_helpers.S
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2014, ARM Limited and Contributors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of ARM nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arch.h>
+#include <asm_macros.S>
+#include <platform_def.h>
+
+	.globl	psci_do_pwrdown_cache_maintenance
+	.globl	psci_do_pwrup_cache_maintenance
+
+/* -----------------------------------------------------------------------
+ * void psci_do_pwrdown_cache_maintenance(uint32_t affinity_level);
+ *
+ * Performs cache maintenance before this cpu is powered off: disables the
+ * data cache, flushes by set/way the cache levels selected by the affinity
+ * level in w0, then flushes the used part of this cpu's stack and
+ * invalidates the unused part, to avoid coherency issues due to the change
+ * in the stack's memory attributes once the data cache is disabled.
+ * Preserves x19, x20, x29 and x30.
+ * -----------------------------------------------------------------------
+ */
+func psci_do_pwrdown_cache_maintenance
+	stp	x29, x30, [sp,#-16]!
+	stp	x19, x20, [sp,#-16]!	/* x19 is used below; x20 only pairs it */
+
+	/* ---------------------------------------------
+	 * Disable the Data Cache.
+	 * ---------------------------------------------
+	 */
+	mrs	x1, sctlr_el3
+	bic	x1, x1, #SCTLR_C_BIT
+	msr	sctlr_el3, x1
+	isb
+
+	/* ---------------------------------------------
+	 * Determine how many levels of cache will be
+	 * subject to cache maintenance. Affinity level
+	 * 0 implies that only the cpu is being powered
+	 * down, so only the L1 data cache is flushed to
+	 * the PoU. For a higher affinity level a flush
+	 * of L1 data and L2 unified cache is assumed to
+	 * be enough; the platform should provide this.
+	 * Compare w0 only: the argument is a uint32_t
+	 * and AAPCS64 leaves x0[63:32] unspecified.
+	 * ---------------------------------------------
+	 */
+	cmp	w0, #MPIDR_AFFLVL0
+	mov	x0, #DCCISW		/* mov leaves the flags untouched */
+	b.ne	flush_caches_to_poc
+
+	/* ---------------------------------------------
+	 * Flush L1 cache to PoU.
+	 * ---------------------------------------------
+	 */
+	bl	dcsw_op_louis
+	b	do_stack_maintenance
+
+	/* ---------------------------------------------
+	 * Flush L1 and L2 caches to PoC.
+	 * ---------------------------------------------
+	 */
+flush_caches_to_poc:
+	bl	dcsw_op_all
+
+	/* ---------------------------------------------
+	 * TODO: Intra-cluster coherency should be
+	 * turned off here once cpu-specific
+	 * abstractions are in place.
+	 * ---------------------------------------------
+	 */
+
+	/* ---------------------------------------------
+	 * Do stack maintenance by flushing the used
+	 * stack to the main memory and invalidating the
+	 * remainder.
+	 * ---------------------------------------------
+	 */
+do_stack_maintenance:
+	mrs	x0, mpidr_el1
+	bl	platform_get_stack
+
+	/* ---------------------------------------------
+	 * Keep the stack top in x19 for later. Flush the
+	 * used stack: x0 = sp, x1 = stack top - sp.
+	 * ---------------------------------------------
+	 */
+	mov	x19, x0
+	mov	x1, sp
+	sub	x1, x0, x1
+	mov	x0, sp
+	bl	flush_dcache_range
+
+	/* ---------------------------------------------
+	 * Invalidate the unused stack: x0 = stack base
+	 * (stack top - PLATFORM_STACK_SIZE) and
+	 * x1 = sp - stack base.
+	 * ---------------------------------------------
+	 */
+	sub	x0, x19, #PLATFORM_STACK_SIZE
+	sub	x1, sp, x0
+	bl	inv_dcache_range
+
+	ldp	x19, x20, [sp], #16
+	ldp	x29, x30, [sp], #16
+	ret
+
+
+/* -----------------------------------------------------------------------
+ * void psci_do_pwrup_cache_maintenance(void);
+ *
+ * Performs cache maintenance after this cpu is powered up: invalidates the
+ * data cache lines covering the used part of this cpu's stack and then sets
+ * SCTLR_EL3.C to turn the data cache on. Takes no arguments.
+ * -----------------------------------------------------------------------
+ */
+func psci_do_pwrup_cache_maintenance
+	stp	x29, x30, [sp,#-16]!
+
+	/* ---------------------------------------------
+	 * Ensure any inflight stack writes, including
+	 * the register save above, reach main memory.
+	 * ---------------------------------------------
+	 */
+	dmb	st
+
+	/* ---------------------------------------------
+	 * Invalidate the used stack memory: x0 = sp
+	 * (start of the used region, not the stack
+	 * base), x1 = platform_get_stack() - sp (size).
+	 * ---------------------------------------------
+	 */
+	mrs	x0, mpidr_el1
+	bl	platform_get_stack
+	mov	x1, sp
+	sub	x1, x0, x1
+	mov	x0, sp
+	bl	inv_dcache_range
+
+	/* ---------------------------------------------
+	 * Enable the data cache by setting SCTLR_EL3.C.
+	 * ---------------------------------------------
+	 */
+	mrs	x0, sctlr_el3
+	orr	x0, x0, #SCTLR_C_BIT
+	msr	sctlr_el3, x0
+	isb
+
+	ldp	x29, x30, [sp], #16
+	ret
diff --git a/services/std_svc/psci/psci_main.c b/services/std_svc/psci/psci_main.c
index d68f3d0f8..21968d9bf 100644
--- a/services/std_svc/psci/psci_main.c
+++ b/services/std_svc/psci/psci_main.c
@@ -230,10 +230,10 @@ uint64_t psci_smc_handler(uint32_t smc_fid,
 			SMC_RET1(handle, psci_version());
 
 		case PSCI_CPU_OFF:
-			SMC_RET1(handle, __psci_cpu_off());
+			SMC_RET1(handle, psci_cpu_off());
 
 		case PSCI_CPU_SUSPEND_AARCH32:
-			SMC_RET1(handle, __psci_cpu_suspend(x1, x2, x3));
+			SMC_RET1(handle, psci_cpu_suspend(x1, x2, x3));
 
 		case PSCI_CPU_ON_AARCH32:
 			SMC_RET1(handle, psci_cpu_on(x1, x2, x3));
@@ -258,7 +258,7 @@ uint64_t psci_smc_handler(uint32_t smc_fid,
 
 		switch (smc_fid) {
 		case PSCI_CPU_SUSPEND_AARCH64:
-			SMC_RET1(handle, __psci_cpu_suspend(x1, x2, x3));
+			SMC_RET1(handle, psci_cpu_suspend(x1, x2, x3));
 
 		case PSCI_CPU_ON_AARCH64:
 			SMC_RET1(handle, psci_cpu_on(x1, x2, x3));
diff --git a/services/std_svc/psci/psci_private.h b/services/std_svc/psci/psci_private.h
index 06db63f15..b47bf8591 100644
--- a/services/std_svc/psci/psci_private.h
+++ b/services/std_svc/psci/psci_private.h
@@ -128,5 +128,8 @@ int psci_afflvl_suspend(unsigned long,
 			int);
 unsigned int psci_afflvl_suspend_finish(int, int);
 
+/* Private exported functions from psci_helpers.S */
+void psci_do_pwrdown_cache_maintenance(uint32_t affinity_level);
+void psci_do_pwrup_cache_maintenance(void);
 
 #endif /* __PSCI_PRIVATE_H__ */