Bug 75757

Summary: Patch to fix MCA related issues in 7.2 2.4.9-34 ia64 kernel
Product: [Retired] Red Hat Linux Reporter: Dale Busacker <dale.l.busacker>
Component: kernel Assignee: Arjan van de Ven <arjanv>
Status: CLOSED WONTFIX QA Contact: Brian Brock <bbrock>
Severity: medium Docs Contact:
Priority: medium    
Version: 7.2   
Target Milestone: ---   
Target Release: ---   
Hardware: ia64   
OS: Linux   
Whiteboard:
Fixed In Version: Doc Type: Bug Fix
Doc Text:
Story Points: ---
Clone Of: Environment:
Last Closed: 2003-04-05 16:35:56 UTC Type: ---
Regression: --- Mount Type: ---
Documentation: --- CRM:
Verified Versions: Category: ---
oVirt Team: --- RHEL 7.3 requirements from Atomic Host:
Cloudforms Team: --- Target Upstream Version:
Embargoed:
Attachments:
Description Flags
patch for MCA related issues in 2.4.9-34 errata kernel none

Description Dale Busacker 2002-10-11 21:25:34 UTC
From Bugzilla Helper:
User-Agent: Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)

Description of problem:
- date printing patch from Matt Wilcox
- nested MCA psr.mc bit deposit patch
- fatal error log clearing patch
- PAL min-state save area patch

Version-Release number of selected component (if applicable):


How reproducible:
Always

Steps to Reproduce:
1. Requires special error injection tools
2.
3.
	

Additional info:

patch for 2.4.9-34 errata kernel:

--- arch/ia64/kernel/mca.c	Sat Jun  1 02:53:14 2002
+++ arch/ia64/kernel/mca.c	Fri Oct 11 11:36:10 2002
@@ -64,10 +64,10 @@
 u64				ia64_mca_bspstore[1024];
 u64				ia64_init_stack[INIT_TASK_SIZE] __attribute__((aligned(16)));
 u64				ia64_mca_sal_data_area[1356];
-u64				ia64_mca_min_state_save_info;
 u64				ia64_tlb_functional;
 u64				ia64_os_mca_recovery_successful;
-
+/* TODO: need to assign min-state structure to UC memory */
+u64				ia64_mca_min_state_save_info[MIN_STATE_AREA_SIZE] __attribute__((aligned(512)));
 static void			ia64_mca_wakeup_ipi_wait(void);
 static void			ia64_mca_wakeup(int cpu);
 static void			ia64_mca_wakeup_all(void);
@@ -111,7 +111,7 @@
  *  Outputs :   platform error status
  */
 int
-ia64_mca_log_sal_error_record(int sal_info_type)
+ia64_mca_log_sal_error_record(int sal_info_type, int called_from_init)
 {
 	int platform_err = 0;
 
@@ -126,9 +126,10 @@
 	 */
 
 	platform_err = ia64_log_print(sal_info_type, (prfunc_t)printk);
-	/* temporary:  only clear SAL logs on hardware-corrected errors */
-	if (sal_info_type > 1)
-	ia64_sal_clear_state_info(sal_info_type);
+	/* temporary: only clear SAL logs on hardware-corrected errors
+		or if we're logging an error after an MCA-initiated reboot */
+	if ((sal_info_type > 1) || (called_from_init))
+		ia64_sal_clear_state_info(sal_info_type);
 
 	return platform_err;
 }
@@ -149,7 +150,7 @@
 	IA64_MCA_DEBUG("ia64_mca_cpe_int_handler : received interrupt. vector = %#x\n", cpe_irq);
 
 	/* Get the CMC error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE);
+	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CPE, 0);
 }
 
 /*
@@ -202,7 +203,7 @@
 	/*
 	 *  If there is an MCA error record pending, get it and log it.
 	 */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
+	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA, 1);
 }
 
 /*
@@ -254,15 +255,23 @@
  *
  */
 void
-ia64_process_min_state_save (pal_min_state_area_t *pmss, struct pt_regs *ptregs)
+ia64_process_min_state_save (pal_min_state_area_t *pmss)
 {
-	int i, max=57;
-	u64 *tpmss_ptr=(u64 *)pmss;
+	int i, max = MIN_STATE_AREA_SIZE;
+	u64 *tpmss_ptr = (u64 *)pmss;
+	u64 *return_min_state_ptr = ia64_mca_min_state_save_info;
 
 	/* dump out the min_state_area information */
 
 	for (i=0;i<max;i++) {
 
+		/* copy min-state register info for eventual return to PAL */
+		*return_min_state_ptr++ = *tpmss_ptr;
+
+#if 1
+		tpmss_ptr++;  /* skip to next entry */
+#else
+		/* the printing part */
 		if(!ia64_pmss_dump_bank0) {
 			if(strncmp("B0",min_state_labels[i],2)==0) {
 				tpmss_ptr++;  /* skip to next entry */
@@ -275,6 +284,7 @@
 		if (((i+1)%3)==0 || ((!strcmp("GR16",min_state_labels[i]))
 				     && !ia64_pmss_dump_bank0))
 			printk("\n");
+#endif
 	}
 }
 
@@ -522,19 +532,17 @@
 	ia64_log_init(SAL_INFO_TYPE_CMC);
 	ia64_log_init(SAL_INFO_TYPE_CPE);
 
-	/* Zero the min state save info */
-	ia64_mca_min_state_save_info = 0;
-
 #if defined(MCA_TEST)
 	mca_test();
 #endif /* #if defined(MCA_TEST) */
 
 	printk("Mca related initialization done\n");
 
-#if 0   // Too early in initialization -- error log is lost
+	/* commented out because this is done elsewhere */
+#if 0
 	/* Do post-failure MCA error logging */
 	ia64_mca_check_errors();
-#endif  // Too early in initialization -- error log is lost
+#endif
 }
 
 /*
@@ -695,15 +703,13 @@
 
 	/* Cold Boot for uncorrectable MCA */
 	ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_COLD_BOOT;
-	// for CV test purposes:
-//	ia64_os_to_sal_handoff_state.imots_os_status = IA64_MCA_CORRECTED;
 
 	/* Default = tell SAL to return to same context */
 	ia64_os_to_sal_handoff_state.imots_context = IA64_MCA_SAME_CONTEXT;
 
 	/* Register pointer to new min state values */
-	/* NOTE: need to do something with this during recovery phase */
-	ia64_os_to_sal_handoff_state.imots_new_min_state = &ia64_mca_min_state_save_info;
+	ia64_os_to_sal_handoff_state.imots_new_min_state =
+		(pal_min_state_area_t *)ia64_mca_min_state_save_info;
 }
 
 /*
@@ -728,7 +734,7 @@
 	int platform_err = 0;
 
 	/* Get the MCA error record and log it */
-	platform_err = ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA);
+	platform_err = ia64_mca_log_sal_error_record(SAL_INFO_TYPE_MCA, 0);
 
 	/*
 	 *  Do Platform-specific mca error handling if required.
@@ -768,7 +774,7 @@
 		       cmc_irq, smp_processor_id());
 
 	/* Get the CMC error record and log it */
-	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC);
+	ia64_mca_log_sal_error_record(SAL_INFO_TYPE_CMC, 0);
 }
 
 /*
@@ -834,11 +840,10 @@
 	plog_ptr=(ia64_err_rec_t *)IA64_LOG_CURR_BUFFER(SAL_INFO_TYPE_INIT);
 	proc_ptr = &plog_ptr->proc_err;
 
-	ia64_process_min_state_save(&proc_ptr->processor_static_info.min_state_area,
-				    regs);
+	ia64_process_min_state_save(&proc_ptr->processor_static_info.min_state_area);
 
 	/* Clear the INIT SAL logs now that they have been saved in the OS buffer */
-//	ia64_sal_clear_state_info(SAL_INFO_TYPE_INIT);
+	ia64_sal_clear_state_info(SAL_INFO_TYPE_INIT);
 
 	init_handler_platform(regs);              /* call platform specific routines */
 }
@@ -1004,28 +1009,13 @@
 void
 ia64_log_rec_header_print (sal_log_record_header_t *lh, prfunc_t prfunc)
 {
-	char str_buf[32];
-
-	sprintf(str_buf, "%2d.%02d",
-		(lh->revision.major >> 4) * 10 + (lh->revision.major & 0xf),
-		(lh->revision.minor >> 4) * 10 + (lh->revision.minor & 0xf));
-	prfunc("+Err Record ID: %d    SAL Rev: %s\n", lh->id, str_buf);
-	sprintf(str_buf, "%02d/%02d/%04d/ %02d:%02d:%02d",
-		(lh->timestamp.slh_month >> 4) * 10 +
-		(lh->timestamp.slh_month & 0xf),
-		(lh->timestamp.slh_day >> 4) * 10 +
-		(lh->timestamp.slh_day & 0xf),
-		(lh->timestamp.slh_century >> 4) * 1000 +
-		(lh->timestamp.slh_century & 0xf) * 100 +
-		(lh->timestamp.slh_year >> 4) * 10 +
-		(lh->timestamp.slh_year & 0xf),
-		(lh->timestamp.slh_hour >> 4) * 10 +
-		(lh->timestamp.slh_hour & 0xf),
-		(lh->timestamp.slh_minute >> 4) * 10 +
-		(lh->timestamp.slh_minute & 0xf),
-		(lh->timestamp.slh_second >> 4) * 10 +
-		(lh->timestamp.slh_second & 0xf));
-	prfunc("+Time: %s    Severity %d\n", str_buf, lh->severity);
+	prfunc("+Err Record ID: %d    SAL Rev: %2x.%02x\n", lh->id,
+		lh->revision.major, lh->revision.minor);
+	prfunc("+Time: %02x/%02x/%02x%02x %02x:%02x:%02x    Severity %d\n",
+		lh->timestamp.slh_month, lh->timestamp.slh_day,
+		lh->timestamp.slh_century, lh->timestamp.slh_year,
+		lh->timestamp.slh_hour, lh->timestamp.slh_minute,
+		lh->timestamp.slh_second, lh->severity);
 }
 
 /*
@@ -1682,6 +1672,9 @@
 	if (slpi->valid.psi_static_struct) {
 		spsi = (sal_processor_static_info_t *)p_data;
 
+		/* copy interrupted context PAL min-state info */
+		ia64_process_min_state_save(&spsi->min_state_area);
+
 		/* Print branch register contents if valid */
 		if (spsi->valid.br)
 			ia64_log_processor_regs_print(spsi->br, 8, "Branch", "br",
--- include/asm-ia64/mca.h	Sat Jun  1 03:16:29 2002
+++ include/asm-ia64/mca.h	Fri Oct 11 08:28:31 2002
@@ -105,6 +105,8 @@
 	IA64_MCA_NEW_CONTEXT	=	-1	/* SAL to return to new context */
 };
 
+#define MIN_STATE_AREA_SIZE     57
+
 typedef struct ia64_mca_os_to_sal_state_s {
 	u64		imots_os_status;	/*   OS status to SAL as to what happened
 						 *   with the MCA handling.
--- include/asm-ia64/mca_asm.h	Sat Jun  1 02:53:14 2002
+++ include/asm-ia64/mca_asm.h	Wed Oct  9 13:45:38 2002
@@ -16,6 +16,7 @@
 #define PSR_I		14
 #define	PSR_DT		17
 #define PSR_RT		27
+#define PSR_MC		35
 #define PSR_IT		36
 #define PSR_BN		44
 
@@ -101,6 +102,8 @@
 	;;										\
 	dep	temp1 = 0, temp1, PSR_IC, 1;						\
 	;;										\
+	dep	temp1 = -1, temp1, PSR_MC, 1;						\
+	;;										\
 	movl	temp2 = start_addr;							\
 	mov	cr.ipsr = temp1;							\
 	;;										\

Comment 1 Dale Busacker 2002-10-11 21:29:11 UTC
Created attachment 80065 [details]
patch for MCA related issues in 2.4.9-34 errata kernel

Comment 2 Arjan van de Ven 2002-10-12 11:25:29 UTC
What kind of machines are affected by this?

Comment 3 Arjan van de Ven 2003-04-05 16:35:56 UTC
closing due to long inactivity in NEEDINFO state