#define FERR_NF_UNCORRECTABLE (FERR_NF_M12ERR | \
FERR_NF_M11ERR | \
FERR_NF_M10ERR | \
+ FERR_NF_M9ERR | \
FERR_NF_M8ERR | \
FERR_NF_M7ERR | \
FERR_NF_M6ERR | \
#define MTR_DRAM_BANKS(mtr) ((((mtr) >> 5) & 0x1) ? 8 : 4)
#define MTR_DRAM_BANKS_ADDR_BITS(mtr) ((MTR_DRAM_BANKS(mtr) == 8) ? 3 : 2)
#define MTR_DIMM_RANK(mtr) (((mtr) >> 4) & 0x1)
-#define MTR_DIMM_RANK_ADDR_BITS(mtr) (MTR_DIM_RANKS(mtr) ? 2 : 1)
+#define MTR_DIMM_RANK_ADDR_BITS(mtr) (MTR_DIMM_RANK(mtr) ? 2 : 1)
#define MTR_DIMM_ROWS(mtr) (((mtr) >> 2) & 0x3)
#define MTR_DIMM_ROWS_ADDR_BITS(mtr) (MTR_DIMM_ROWS(mtr) + 13)
#define MTR_DIMM_COLS(mtr) ((mtr) & 0x3)
};
#endif
+/* enables the report of miscellaneous messages as CE errors - default off */
+static int misc_messages;
+
/* Enumeration of supported devices */
enum i5000_chips {
I5000P = 0,
struct pci_dev *branch_0; /* 21.0 */
struct pci_dev *branch_1; /* 22.0 */
- int node_id; /* ID of this node */
-
u16 tolm; /* top of low memory */
u64 ambase; /* AMB BAR */
u16 b1_ambpresent0; /* Branch 1, Channel 8 */
u16 b1_ambpresent1; /* Branch 1, Channel 1 */
- /* DIMM infomation matrix, allocating architecture maximums */
+ /* DIMM information matrix, allocating architecture maximums */
struct i5000_dimm_info dimm_info[MAX_CSROWS][MAX_CHANNELS];
/* Actual values for this controller */
static struct edac_pci_ctl_info *i5000_pci;
-/******************************************************************************
+/*
* i5000_get_error_info Retrieve the hardware error information from
* the hardware and cache it in the 'info'
* structure
*/
static void i5000_get_error_info(struct mem_ctl_info *mci,
- struct i5000_error_info * info)
+ struct i5000_error_info *info)
{
struct i5000_pvt *pvt;
u32 value;
- pvt = (struct i5000_pvt *)mci->pvt_info;
+ pvt = mci->pvt_info;
/* read in the 1st FATAL error register */
pci_read_config_dword(pvt->branchmap_werrors, FERR_FAT_FBD, &value);
}
}
-/******************************************************************************
+/*
* i5000_process_fatal_error_info(struct mem_ctl_info *mci,
* struct i5000_error_info *info,
* int handle_errors);
* handle the Intel FATAL errors, if any
*/
static void i5000_process_fatal_error_info(struct mem_ctl_info *mci,
- struct i5000_error_info * info,
+ struct i5000_error_info *info,
int handle_errors)
{
- char msg[EDAC_MC_LABEL_LEN + 1 + 90];
+ char msg[EDAC_MC_LABEL_LEN + 1 + 160];
+ char *specific = NULL;
u32 allErrors;
int branch;
int channel;
if (!allErrors)
return; /* if no error, return now */
- /* ONLY ONE of the possible error bits will be set, as per the docs */
- i5000_mc_printk(mci, KERN_ERR,
- "FATAL ERRORS Found!!! 1st FATAL Err Reg= 0x%x\n",
- allErrors);
-
branch = EXTRACT_FBDCHAN_INDX(info->ferr_fat_fbd);
channel = branch;
rdwr ? "Write" : "Read", ras, cas);
/* Only 1 bit will be on */
- if (allErrors & FERR_FAT_M1ERR) {
- i5000_mc_printk(mci, KERN_ERR,
- "Alert on non-redundant retry or fast "
- "reset timeout\n");
-
- } else if (allErrors & FERR_FAT_M2ERR) {
- i5000_mc_printk(mci, KERN_ERR,
- "Northbound CRC error on non-redundant "
- "retry\n");
-
- } else if (allErrors & FERR_FAT_M3ERR) {
- i5000_mc_printk(mci, KERN_ERR,
- ">Tmid Thermal event with intelligent "
- "throttling disabled\n");
+ switch (allErrors) {
+ case FERR_FAT_M1ERR:
+ specific = "Alert on non-redundant retry or fast "
+ "reset timeout";
+ break;
+ case FERR_FAT_M2ERR:
+ specific = "Northbound CRC error on non-redundant "
+ "retry";
+ break;
+ case FERR_FAT_M3ERR:
+ {
+ static int done;
+
+ /*
+ * This error is generated to inform that the intelligent
+ * throttling is disabled and the temperature passed the
+ * specified middle point. Since this is something the BIOS
+ * should take care of, we'll warn only once to avoid
+ * worthlessly flooding the log.
+ */
+ if (done)
+ return;
+ done++;
+
+ specific = ">Tmid Thermal event with intelligent "
+ "throttling disabled";
+ }
+ break;
}
/* Form out message */
snprintf(msg, sizeof(msg),
"(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d CAS=%d "
- "FATAL Err=0x%x)",
+ "FATAL Err=0x%x (%s))",
branch >> 1, bank, rdwr ? "Write" : "Read", ras, cas,
- allErrors);
+ allErrors, specific);
/* Call the helper to output message */
edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
}
-/******************************************************************************
+/*
* i5000_process_fatal_error_info(struct mem_ctl_info *mci,
* struct i5000_error_info *info,
* int handle_errors);
* handle the Intel NON-FATAL errors, if any
*/
static void i5000_process_nonfatal_error_info(struct mem_ctl_info *mci,
- struct i5000_error_info * info,
+ struct i5000_error_info *info,
int handle_errors)
{
- char msg[EDAC_MC_LABEL_LEN + 1 + 90];
+ char msg[EDAC_MC_LABEL_LEN + 1 + 170];
+ char *specific = NULL;
u32 allErrors;
u32 ue_errors;
u32 ce_errors;
return; /* if no error, return now */
/* ONLY ONE of the possible error bits will be set, as per the docs */
- i5000_mc_printk(mci, KERN_WARNING,
- "NON-FATAL ERRORS Found!!! 1st NON-FATAL Err "
- "Reg= 0x%x\n", allErrors);
-
ue_errors = allErrors & FERR_NF_UNCORRECTABLE;
if (ue_errors) {
debugf0("\tUncorrected bits= 0x%x\n", ue_errors);
rank, channel, channel + 1, branch >> 1, bank,
rdwr ? "Write" : "Read", ras, cas);
+ switch (ue_errors) {
+ case FERR_NF_M12ERR:
+ specific = "Non-Aliased Uncorrectable Patrol Data ECC";
+ break;
+ case FERR_NF_M11ERR:
+ specific = "Non-Aliased Uncorrectable Spare-Copy "
+ "Data ECC";
+ break;
+ case FERR_NF_M10ERR:
+ specific = "Non-Aliased Uncorrectable Mirrored Demand "
+ "Data ECC";
+ break;
+ case FERR_NF_M9ERR:
+ specific = "Non-Aliased Uncorrectable Non-Mirrored "
+ "Demand Data ECC";
+ break;
+ case FERR_NF_M8ERR:
+ specific = "Aliased Uncorrectable Patrol Data ECC";
+ break;
+ case FERR_NF_M7ERR:
+ specific = "Aliased Uncorrectable Spare-Copy Data ECC";
+ break;
+ case FERR_NF_M6ERR:
+ specific = "Aliased Uncorrectable Mirrored Demand "
+ "Data ECC";
+ break;
+ case FERR_NF_M5ERR:
+ specific = "Aliased Uncorrectable Non-Mirrored Demand "
+ "Data ECC";
+ break;
+ case FERR_NF_M4ERR:
+ specific = "Uncorrectable Data ECC on Replay";
+ break;
+ }
+
/* Form out message */
snprintf(msg, sizeof(msg),
"(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d "
- "CAS=%d, UE Err=0x%x)",
+ "CAS=%d, UE Err=0x%x (%s))",
branch >> 1, bank, rdwr ? "Write" : "Read", ras, cas,
- ue_errors);
+ ue_errors, specific);
/* Call the helper to output message */
edac_mc_handle_fbd_ue(mci, rank, channel, channel + 1, msg);
rank, channel, branch >> 1, bank,
rdwr ? "Write" : "Read", ras, cas);
+ switch (ce_errors) {
+ case FERR_NF_M17ERR:
+ specific = "Correctable Non-Mirrored Demand Data ECC";
+ break;
+ case FERR_NF_M18ERR:
+ specific = "Correctable Mirrored Demand Data ECC";
+ break;
+ case FERR_NF_M19ERR:
+ specific = "Correctable Spare-Copy Data ECC";
+ break;
+ case FERR_NF_M20ERR:
+ specific = "Correctable Patrol Data ECC";
+ break;
+ }
+
/* Form out message */
snprintf(msg, sizeof(msg),
"(Branch=%d DRAM-Bank=%d RDWR=%s RAS=%d "
- "CAS=%d, CE Err=0x%x)", branch >> 1, bank,
- rdwr ? "Write" : "Read", ras, cas, ce_errors);
+ "CAS=%d, CE Err=0x%x (%s))", branch >> 1, bank,
+ rdwr ? "Write" : "Read", ras, cas, ce_errors,
+ specific);
/* Call the helper to output message */
edac_mc_handle_fbd_ce(mci, rank, channel, msg);
}
- /* See if any of the thermal errors have fired */
- misc_errors = allErrors & FERR_NF_THERMAL;
- if (misc_errors) {
- i5000_printk(KERN_WARNING, "\tTHERMAL Error, bits= 0x%x\n",
- misc_errors);
- }
-
- /* See if any of the thermal errors have fired */
- misc_errors = allErrors & FERR_NF_NON_RETRY;
- if (misc_errors) {
- i5000_printk(KERN_WARNING, "\tNON-Retry Errors, bits= 0x%x\n",
- misc_errors);
- }
+ if (!misc_messages)
+ return;
- /* See if any of the thermal errors have fired */
- misc_errors = allErrors & FERR_NF_NORTH_CRC;
+ misc_errors = allErrors & (FERR_NF_NON_RETRY | FERR_NF_NORTH_CRC |
+ FERR_NF_SPD_PROTOCOL | FERR_NF_DIMM_SPARE);
if (misc_errors) {
- i5000_printk(KERN_WARNING,
- "\tNORTHBOUND CRC Error, bits= 0x%x\n",
- misc_errors);
- }
+ switch (misc_errors) {
+ case FERR_NF_M13ERR:
+ specific = "Non-Retry or Redundant Retry FBD Memory "
+ "Alert or Redundant Fast Reset Timeout";
+ break;
+ case FERR_NF_M14ERR:
+ specific = "Non-Retry or Redundant Retry FBD "
+ "Configuration Alert";
+ break;
+ case FERR_NF_M15ERR:
+ specific = "Non-Retry or Redundant Retry FBD "
+ "Northbound CRC error on read data";
+ break;
+ case FERR_NF_M21ERR:
+ specific = "FBD Northbound CRC error on "
+ "FBD Sync Status";
+ break;
+ case FERR_NF_M22ERR:
+ specific = "SPD protocol error";
+ break;
+ case FERR_NF_M27ERR:
+ specific = "DIMM-spare copy started";
+ break;
+ case FERR_NF_M28ERR:
+ specific = "DIMM-spare copy completed";
+ break;
+ }
+ branch = EXTRACT_FBDCHAN_INDX(info->ferr_nf_fbd);
- /* See if any of the thermal errors have fired */
- misc_errors = allErrors & FERR_NF_SPD_PROTOCOL;
- if (misc_errors) {
- i5000_printk(KERN_WARNING,
- "\tSPD Protocol Error, bits= 0x%x\n",
- misc_errors);
- }
+ /* Form out message */
+ snprintf(msg, sizeof(msg),
+ "(Branch=%d Err=%#x (%s))", branch >> 1,
+ misc_errors, specific);
- /* See if any of the thermal errors have fired */
- misc_errors = allErrors & FERR_NF_DIMM_SPARE;
- if (misc_errors) {
- i5000_printk(KERN_WARNING, "\tDIMM-Spare Error, bits= 0x%x\n",
- misc_errors);
+ /* Call the helper to output message */
+ edac_mc_handle_fbd_ce(mci, 0, 0, msg);
}
}
-/******************************************************************************
+/*
* i5000_process_error_info Process the error info that is
* in the 'info' structure, previously retrieved from hardware
*/
static void i5000_process_error_info(struct mem_ctl_info *mci,
- struct i5000_error_info * info,
+ struct i5000_error_info *info,
int handle_errors)
{
/* First handle any fatal errors that occurred */
i5000_process_nonfatal_error_info(mci, info, handle_errors);
}
-/******************************************************************************
+/*
* i5000_clear_error Retrieve any error from the hardware
* but do NOT process that error.
* Used for 'clearing' out of previous errors
i5000_get_error_info(mci, &info);
}
-/******************************************************************************
+/*
* i5000_check_error Retrieve and process errors reported by the
* hardware. Called by the Core module.
*/
i5000_process_error_info(mci, &info, 1);
}
-/******************************************************************************
+/*
* i5000_get_devices Find and perform 'get' operation on the MCH's
* device/functions we want to reference for this driver
*
struct i5000_pvt *pvt;
struct pci_dev *pdev;
- pvt = (struct i5000_pvt *)mci->pvt_info;
+ pvt = mci->pvt_info;
/* Attempt to 'get' the MCH register we want */
pdev = NULL;
return 0;
}
-/******************************************************************************
+/*
* i5000_put_devices 'put' all the devices that we have
* reserved via 'get'
*/
{
struct i5000_pvt *pvt;
- pvt = (struct i5000_pvt *)mci->pvt_info;
+ pvt = mci->pvt_info;
pci_dev_put(pvt->branchmap_werrors); /* FUNC 1 */
pci_dev_put(pvt->fsb_error_regs); /* FUNC 2 */
pci_dev_put(pvt->branch_0); /* DEV 21 */
/* Only if more than 2 channels do we release the second branch */
- if (pvt->maxch >= CHANNELS_PER_BRANCH) {
+ if (pvt->maxch >= CHANNELS_PER_BRANCH)
pci_dev_put(pvt->branch_1); /* DEV 22 */
- }
}
-/******************************************************************************
+/*
* determine_amb_resent
*
* the information is contained in NUM_MTRS different registers
return amb_present;
}
-/******************************************************************************
+/*
* determine_mtr(pvt, csrow, channel)
*
* return the proper MTR register as determine by the csrow and channel desired
return mtr;
}
-/******************************************************************************
+/*
*/
static void decode_mtr(int slot_row, u16 mtr)
{
}
}
-/******************************************************************************
+/*
* calculate_dimm_size
*
* also will output a DIMM matrix map, if debug is enabled, for viewing
kfree(mem_buffer);
}
-/******************************************************************************
+/*
* i5000_get_mc_regs read in the necessary registers and
* cache locally
*
int maxdimmperch;
int way0, way1;
- pvt = (struct i5000_pvt *)mci->pvt_info;
+ pvt = mci->pvt_info;
pci_read_config_dword(pvt->system_address, AMBASE,
(u32 *) & pvt->ambase);
calculate_dimm_size(pvt);
}
-/******************************************************************************
+/*
* i5000_init_csrows Initialize the 'csrows' table within
* the mci control structure with the
* addressing of memory.
int channel;
int csrow;
- pvt = (struct i5000_pvt *)mci->pvt_info;
+ pvt = mci->pvt_info;
channel_count = pvt->maxch;
max_csrows = pvt->maxdimmperch * 2;
return empty;
}
-/******************************************************************************
+/*
* i5000_enable_error_reporting
* Turn on the memory reporting features of the hardware
*/
struct i5000_pvt *pvt;
u32 fbd_error_mask;
- pvt = (struct i5000_pvt *)mci->pvt_info;
+ pvt = mci->pvt_info;
/* Read the FBD Error Mask Register */
pci_read_config_dword(pvt->branchmap_werrors, EMASK_FBD,
fbd_error_mask);
}
-/******************************************************************************
+/*
* i5000_get_dimm_and_channel_counts(pdev, &num_csrows, &num_channels)
*
* ask the device how many channels are present and how many CSROWS
*num_channels = (int)value;
}
-/******************************************************************************
+/*
* i5000_probe1 Probe for ONE instance of device to see if it is
* present.
* return:
if (PCI_FUNC(pdev->devfn) != 0)
return -ENODEV;
- /* make sure error reporting method is sane */
- switch (edac_op_state) {
- case EDAC_OPSTATE_POLL:
- case EDAC_OPSTATE_NMI:
- break;
- default:
- edac_op_state = EDAC_OPSTATE_POLL;
- break;
- }
-
/* Ask the devices for the number of CSROWS and CHANNELS so
* that we can calculate the memory resources, etc
*
__func__, num_channels, num_dimms_per_channel, num_csrows);
/* allocate a new MC control structure */
- mci = edac_mc_alloc(sizeof(*pvt), num_csrows, num_channels);
+ mci = edac_mc_alloc(sizeof(*pvt), num_csrows, num_channels, 0);
if (mci == NULL)
return -ENOMEM;
+ kobject_get(&mci->edac_mci_kobj);
debugf0("MC: " __FILE__ ": %s(): mci = %p\n", __func__, mci);
mci->dev = &pdev->dev; /* record ptr to the generic device */
- pvt = (struct i5000_pvt *)mci->pvt_info;
+ pvt = mci->pvt_info;
pvt->system_address = pdev; /* Record this device in our private */
pvt->maxch = num_channels;
pvt->maxdimmperch = num_dimms_per_channel;
}
/* add this new MC control structure to EDAC's list of MCs */
- if (edac_mc_add_mc(mci, pvt->node_id)) {
+ if (edac_mc_add_mc(mci)) {
debugf0("MC: " __FILE__
": %s(): failed edac_mc_add_mc()\n", __func__);
/* FIXME: perhaps some code should go here that disables error
i5000_put_devices(mci);
fail0:
+ kobject_put(&mci->edac_mci_kobj);
edac_mc_free(mci);
return -ENODEV;
}
-/******************************************************************************
+/*
* i5000_init_one constructor for one instance of device
*
* returns:
return i5000_probe1(pdev, id->driver_data);
}
-/**************************************************************************
+/*
* i5000_remove_one destructor for one instance of device
*
*/
/* retrieve references to resources, and free those resources */
i5000_put_devices(mci);
-
+ kobject_put(&mci->edac_mci_kobj);
edac_mc_free(mci);
}
-/**************************************************************************
+/*
* pci_device_id table for which devices we are looking for
*
* The "E500P" device is the first device supported.
MODULE_DEVICE_TABLE(pci, i5000_pci_tbl);
-/**************************************************************************
+/*
* i5000_driver pci_driver structure for this module
*
*/
static struct pci_driver i5000_driver = {
- .name = __stringify(KBUILD_BASENAME),
+ .name = KBUILD_BASENAME,
.probe = i5000_init_one,
.remove = __devexit_p(i5000_remove_one),
.id_table = i5000_pci_tbl,
};
-/**************************************************************************
+/*
* i5000_init Module entry function
* Try to initialize this module for its devices
*/
debugf2("MC: " __FILE__ ": %s()\n", __func__);
+ /* Ensure that the OPSTATE is set correctly for POLL or NMI */
+ opstate_init();
+
pci_rc = pci_register_driver(&i5000_driver);
return (pci_rc < 0) ? pci_rc : 0;
}
-/**************************************************************************
+/*
* i5000_exit() Module exit function
* Unregister the driver
*/
("Linux Networx (http://lnxi.com) Doug Thompson <norsk5@xmission.com>");
MODULE_DESCRIPTION("MC Driver for Intel I5000 memory controllers - "
I5000_REVISION);
+
module_param(edac_op_state, int, 0444);
MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
+module_param(misc_messages, int, 0444);
+MODULE_PARM_DESC(misc_messages, "Log miscellaneous non fatal messages");
+