[patch] MCA recovery: user errors surfacing in kernel context

From: Russ Anderson <rja_at_efs.americas.sgi.com>
Date: 2005-11-12 08:42:24
[patch] MCA recovery: user errors surfacing in kernel context

Memory errors encountered by user applications may surface
when the CPU is running in kernel context.  An example is
a user process launching a load of memory with bad ECC, but
an interrupt comes in before the MCA surfaces.  Since the CPU
is in privileged mode, the current code will assume the error
is a kernel error and not recover.  This patch adds a check
for cases where the user initiated the load that surfaces in
kernel interrupt code.

Signed-off-by: Russ Anderson (rja@sgi.com)

--------------------------------------------------------------
 arch/ia64/kernel/mca_drv.c     |   19 +++++++++++++------
 arch/ia64/kernel/mca_drv.h     |    7 +++++++
 arch/ia64/kernel/mca_drv_asm.S |    6 ++++--
 3 files changed, 24 insertions(+), 8 deletions(-)

Index: test/arch/ia64/kernel/mca_drv.c
===================================================================
--- test.orig/arch/ia64/kernel/mca_drv.c	2005-11-08 16:14:23.925602126 -0600
+++ test/arch/ia64/kernel/mca_drv.c	2005-11-09 18:26:37.323328530 -0600
@@ -121,10 +121,12 @@ mca_page_isolate(unsigned long paddr)
  */
 
 void
-mca_handler_bh(unsigned long paddr)
+mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr)
 {
-	printk(KERN_DEBUG "OS_MCA: process [pid: %d](%s) encounters MCA.\n",
-		current->pid, current->comm);
+	printk(KERN_DEBUG "OS_MCA: process [cpu %d, pid: %d, uid: %d, "
+		"iip: %p, psr: 0x%lx,paddr: 0x%lx](%s) encounters MCA.\n",
+		raw_smp_processor_id(), current->pid, current->uid,
+		iip, ipsr, paddr, current->comm);
 
 	spin_lock(&mca_bh_lock);
 	switch (mca_page_isolate(paddr)) {
@@ -438,21 +440,25 @@ recover_from_read_error(slidx_table_t *s
 	 */
 
 	psr1 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_ipsr);
+	psr2 =(struct ia64_psr *)&(peidx_minstate_area(peidx)->pmsa_xpsr);
 
 	/*
 	 *  Check the privilege level of interrupted context.
 	 *   If it is user-mode, then terminate affected process.
 	 */
-	if (psr1->cpl != 0) {
+
+	pmsa = sos->pal_min_state;
+	if (psr1->cpl != 0 || ((psr2->cpl != 0) && in_interrupt_code(pmsa->pmsa_iip))) {
 		smei = peidx_bus_check(peidx, 0);
 		if (smei->valid.target_identifier) {
 			/*
 			 *  setup for resume to bottom half of MCA,
 			 * "mca_handler_bhhook"
 			 */
-			pmsa = sos->pal_min_state;
-			/* pass to bhhook as 1st argument (gr8) */
+			/* pass to bhhook as argument (gr8, ...) */
 			pmsa->pmsa_gr[8-1] = smei->target_identifier;
+			pmsa->pmsa_gr[9-1] = pmsa->pmsa_iip;
+			pmsa->pmsa_gr[10-1] = pmsa->pmsa_ipsr;
 			/* set interrupted return address (but no use) */
 			pmsa->pmsa_br0 = pmsa->pmsa_iip;
 			/* change resume address to bottom half */
@@ -462,6 +468,7 @@ recover_from_read_error(slidx_table_t *s
 			psr2 = (struct ia64_psr *)&pmsa->pmsa_ipsr;
 			psr2->cpl = 0;
 			psr2->ri  = 0;
+			psr2->bn  = 1;
 			psr2->i  = 0;
 
 			return 1;
Index: test/arch/ia64/kernel/mca_drv.h
===================================================================
--- test.orig/arch/ia64/kernel/mca_drv.h	2005-11-08 16:14:23.924625661 -0600
+++ test/arch/ia64/kernel/mca_drv.h	2005-11-09 19:24:16.218162450 -0600
@@ -111,3 +111,10 @@ typedef struct slidx_table {
 	slidx_foreach_entry(__pos, &((slidx)->sec)) { __count++; }\
 	__count; })
 
+/* Returns non-zero if pc lies in the 32KB Interrupt Vector Table at ia64_ivt */
+static __inline__ int in_interrupt_code(unsigned long pc)
+{
+	extern char ia64_ivt[];
+	return (pc >= (u_long)ia64_ivt && pc < (u_long)ia64_ivt+32768);
+}
+
Index: test/arch/ia64/kernel/mca_drv_asm.S
===================================================================
--- test.orig/arch/ia64/kernel/mca_drv_asm.S	2005-11-08 16:14:23.924625661 -0600
+++ test/arch/ia64/kernel/mca_drv_asm.S	2005-11-08 16:14:53.228349917 -0600
@@ -19,7 +19,7 @@ GLOBAL_ENTRY(mca_handler_bhhook)
 	;;
 	clrrrb
 	;;						
-	alloc	r16=ar.pfs,0,2,1,0	// make a new frame
+	alloc	r16=ar.pfs,0,2,3,0	// make a new frame
 	;;
 	mov	ar.rsc=0
 	;;
@@ -40,11 +40,13 @@ GLOBAL_ENTRY(mca_handler_bhhook)
 	movl	loc1=mca_handler_bh	// recovery C function
 	;;
 	mov	out0=r8			// poisoned address
+	mov	out1=r9			// iip
+	mov	out2=r10		// psr
 	mov	b6=loc1
 	;;
 	mov	loc1=rp
 	;;
-	ssm	psr.i
+	ssm	psr.i | psr.ic
 	;;
 	br.call.sptk.many rp=b6		// does not return ...
 	;;

-- 
Russ Anderson, OS RAS/Partitioning Project Lead  
SGI - Silicon Graphics Inc          rja@sgi.com
-
To unsubscribe from this list: send the line "unsubscribe linux-ia64" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Received on Sat Nov 12 08:43:54 2005

This archive was generated by hypermail 2.1.8 : 2005-11-12 08:44:01 EST