/* Lookup table for all possible MC control instances */
struct amd64_pvt;
-static struct mem_ctl_info *mci_lookup[MAX_NUMNODES];
-static struct amd64_pvt *pvt_lookup[MAX_NUMNODES];
+static struct mem_ctl_info *mci_lookup[EDAC_MAX_NUMNODES];
+static struct amd64_pvt *pvt_lookup[EDAC_MAX_NUMNODES];
+
+/*
+ * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
+ * for DDR2 DRAM mapping.
+ */
+u32 revf_quad_ddr2_shift[] = {
+ 0, /* 0000b NULL DIMM (128mb) */
+ 28, /* 0001b 256mb */
+ 29, /* 0010b 512mb */
+ 29, /* 0011b 512mb */
+ 29, /* 0100b 512mb */
+ 30, /* 0101b 1gb */
+ 30, /* 0110b 1gb */
+ 31, /* 0111b 2gb */
+ 31, /* 1000b 2gb */
+ 32, /* 1001b 4gb */
+ 32, /* 1010b 4gb */
+ 33, /* 1011b 8gb */
+ 0, /* 1100b future */
+ 0, /* 1101b future */
+ 0, /* 1110b future */
+ 0 /* 1111b future */
+};
+
+/*
+ * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
+ * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
+ * or higher value'.
+ *
+ *FIXME: Produce a better mapping/linearisation.
+ */
+
+struct scrubrate scrubrates[] = {
+ { 0x01, 1600000000UL},
+ { 0x02, 800000000UL},
+ { 0x03, 400000000UL},
+ { 0x04, 200000000UL},
+ { 0x05, 100000000UL},
+ { 0x06, 50000000UL},
+ { 0x07, 25000000UL},
+ { 0x08, 12284069UL},
+ { 0x09, 6274509UL},
+ { 0x0A, 3121951UL},
+ { 0x0B, 1560975UL},
+ { 0x0C, 781440UL},
+ { 0x0D, 390720UL},
+ { 0x0E, 195300UL},
+ { 0x0F, 97650UL},
+ { 0x10, 48854UL},
+ { 0x11, 24427UL},
+ { 0x12, 12213UL},
+ { 0x13, 6101UL},
+ { 0x14, 3051UL},
+ { 0x15, 1523UL},
+ { 0x16, 761UL},
+ { 0x00, 0UL}, /* scrubbing off */
+};
/*
* Memory scrubber control interface. For K8, memory scrubbing is handled by
/* Map from a CSROW entry to the mask entry that operates on it */
static inline u32 amd64_map_to_dcs_mask(struct amd64_pvt *pvt, int csrow)
{
- return csrow >> (pvt->num_dcsm >> 3);
+ if (boot_cpu_data.x86 == 0xf && pvt->ext_model < OPTERON_CPU_REV_F)
+ return csrow;
+ else
+ return csrow >> 1;
}
/* return the 'base' address the i'th CS entry of the 'dct' DRAM controller */
intlv_en = pvt->dram_IntlvEn[0];
if (intlv_en == 0) {
- for (node_id = 0; ; ) {
+ for (node_id = 0; node_id < DRAM_REG_COUNT; node_id++) {
if (amd64_base_limit_match(pvt, sys_addr, node_id))
- break;
-
- if (++node_id >= DRAM_REG_COUNT)
- goto err_no_match;
+ goto found;
}
- goto found;
+ goto err_no_match;
}
- if (unlikely((intlv_en != (0x01 << 8)) &&
- (intlv_en != (0x03 << 8)) &&
- (intlv_en != (0x07 << 8)))) {
+ if (unlikely((intlv_en != 0x01) &&
+ (intlv_en != 0x03) &&
+ (intlv_en != 0x07))) {
amd64_printk(KERN_WARNING, "junk value of 0x%x extracted from "
"IntlvEn field of DRAM Base Register for node 0: "
- "This probably indicates a BIOS bug.\n", intlv_en);
+ "this probably indicates a BIOS bug.\n", intlv_en);
return NULL;
}
bits = (((u32) sys_addr) >> 12) & intlv_en;
for (node_id = 0; ; ) {
- if ((pvt->dram_limit[node_id] & intlv_en) == bits)
+ if ((pvt->dram_IntlvSel[node_id] & intlv_en) == bits)
break; /* intlv_sel field matches */
if (++node_id >= DRAM_REG_COUNT)
/* sanity test for sys_addr */
if (unlikely(!amd64_base_limit_match(pvt, sys_addr, node_id))) {
amd64_printk(KERN_WARNING,
- "%s(): sys_addr 0x%lx falls outside base/limit "
- "address range for node %d with node interleaving "
- "enabled.\n", __func__, (unsigned long)sys_addr,
- node_id);
+ "%s(): sys_addr 0x%llx falls outside base/limit "
+ "address range for node %d with node interleaving "
+ "enabled.\n",
+ __func__, sys_addr, node_id);
return NULL;
}
* base/mask register pair, test the condition shown near the start of
* section 3.5.4 (p. 84, BKDG #26094, K8, revA-E).
*/
- for (csrow = 0; csrow < CHIPSELECT_COUNT; csrow++) {
+ for (csrow = 0; csrow < pvt->cs_count; csrow++) {
/* This DRAM chip select is disabled on this node */
if ((pvt->dcsb0[csrow] & K8_DCSB_CS_ENABLE) == 0)
u64 base, mask;
pvt = mci->pvt_info;
- BUG_ON((csrow < 0) || (csrow >= CHIPSELECT_COUNT));
+ BUG_ON((csrow < 0) || (csrow >= pvt->cs_count));
base = base_from_dct_base(pvt, csrow);
mask = mask_from_dct_mask(pvt, csrow);
* specific.
*/
static u64 extract_error_address(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
struct amd64_pvt *pvt = mci->pvt_info;
goto err_reg;
}
+ return;
+
err_reg:
debugf0("Error reading F2x%03x.\n", reg);
}
*/
static void amd64_set_dct_base_and_mask(struct amd64_pvt *pvt)
{
- if (pvt->ext_model >= OPTERON_CPU_REV_F) {
+
+ if (boot_cpu_data.x86 == 0xf && pvt->ext_model < OPTERON_CPU_REV_F) {
+ pvt->dcsb_base = REV_E_DCSB_BASE_BITS;
+ pvt->dcsm_mask = REV_E_DCSM_MASK_BITS;
+ pvt->dcs_mask_notused = REV_E_DCS_NOTUSED_BITS;
+ pvt->dcs_shift = REV_E_DCS_SHIFT;
+ pvt->cs_count = 8;
+ pvt->num_dcsm = 8;
+ } else {
pvt->dcsb_base = REV_F_F1Xh_DCSB_BASE_BITS;
pvt->dcsm_mask = REV_F_F1Xh_DCSM_MASK_BITS;
pvt->dcs_mask_notused = REV_F_F1Xh_DCS_NOTUSED_BITS;
pvt->dcs_shift = REV_F_F1Xh_DCS_SHIFT;
- switch (boot_cpu_data.x86) {
- case 0xf:
- pvt->num_dcsm = REV_F_DCSM_COUNT;
- break;
-
- case 0x10:
- pvt->num_dcsm = F10_DCSM_COUNT;
- break;
-
- case 0x11:
- pvt->num_dcsm = F11_DCSM_COUNT;
- break;
-
- default:
- amd64_printk(KERN_ERR, "Unsupported family!\n");
- break;
+ if (boot_cpu_data.x86 == 0x11) {
+ pvt->cs_count = 4;
+ pvt->num_dcsm = 2;
+ } else {
+ pvt->cs_count = 8;
+ pvt->num_dcsm = 4;
}
- } else {
- pvt->dcsb_base = REV_E_DCSB_BASE_BITS;
- pvt->dcsm_mask = REV_E_DCSM_MASK_BITS;
- pvt->dcs_mask_notused = REV_E_DCS_NOTUSED_BITS;
- pvt->dcs_shift = REV_E_DCS_SHIFT;
- pvt->num_dcsm = REV_E_DCSM_COUNT;
}
}
amd64_set_dct_base_and_mask(pvt);
- for (cs = 0; cs < CHIPSELECT_COUNT; cs++) {
+ for (cs = 0; cs < pvt->cs_count; cs++) {
reg = K8_DCSB0 + (cs * 4);
err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
&pvt->dcsb0[cs]);
}
for (cs = 0; cs < pvt->num_dcsm; cs++) {
- reg = K8_DCSB0 + (cs * 4);
+ reg = K8_DCSM0 + (cs * 4);
err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
&pvt->dcsm0[cs]);
if (unlikely(err))
/* extract the ERROR ADDRESS for the K8 CPUs */
static u64 k8_get_error_address(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
return (((u64) (info->nbeah & 0xff)) << 32) +
(info->nbeal & ~0x03);
}
static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info,
+ struct err_regs *info,
u64 SystemAddress)
{
struct mem_ctl_info *src_mci;
u32 page, offset;
/* Extract the syndrome parts and form a 16-bit syndrome */
- syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
- syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+ syndrome = HIGH_SYNDROME(info->nbsl) << 8;
+ syndrome |= LOW_SYNDROME(info->nbsh);
/* CHIPKILL enabled */
if (info->nbcfg & K8_NBCFG_CHIPKILL) {
* different from the node that detected the error.
*/
src_mci = find_mc_by_sys_addr(mci, SystemAddress);
- if (src_mci) {
+ if (!src_mci) {
amd64_mc_printk(mci, KERN_ERR,
"failed to map error address 0x%lx to a node\n",
(unsigned long)SystemAddress);
*/
static int f10_early_channel_count(struct amd64_pvt *pvt)
{
+ int dbams[] = { DBAM0, DBAM1 };
int err = 0, channels = 0;
+ int i, j;
u32 dbam;
err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0);
* is more than just one DIMM present in unganged mode. Need to check
* both controllers since DIMMs can be placed in either one.
*/
- channels = 0;
- err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM0, &dbam);
- if (err)
- goto err_reg;
-
- if (DBAM_DIMM(0, dbam) > 0)
- channels++;
- if (DBAM_DIMM(1, dbam) > 0)
- channels++;
- if (DBAM_DIMM(2, dbam) > 0)
- channels++;
- if (DBAM_DIMM(3, dbam) > 0)
- channels++;
-
- /* If more than 2 DIMMs are present, then we have 2 channels */
- if (channels > 2)
- channels = 2;
- else if (channels == 0) {
- /* No DIMMs on DCT0, so look at DCT1 */
- err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM1, &dbam);
+ for (i = 0; i < ARRAY_SIZE(dbams); i++) {
+ err = pci_read_config_dword(pvt->dram_f2_ctl, dbams[i], &dbam);
if (err)
goto err_reg;
- if (DBAM_DIMM(0, dbam) > 0)
- channels++;
- if (DBAM_DIMM(1, dbam) > 0)
- channels++;
- if (DBAM_DIMM(2, dbam) > 0)
- channels++;
- if (DBAM_DIMM(3, dbam) > 0)
- channels++;
-
- if (channels > 2)
- channels = 2;
+ for (j = 0; j < 4; j++) {
+ if (DBAM_DIMM(j, dbam) > 0) {
+ channels++;
+ break;
+ }
+ }
}
- /* If we found ALL 0 values, then assume just ONE DIMM-ONE Channel */
- if (channels == 0)
- channels = 1;
-
debugf0("MCT channel count: %d\n", channels);
return channels;
}
static u64 f10_get_error_address(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
return (((u64) (info->nbeah & 0xffff)) << 32) +
(info->nbeal & ~0x01);
pvt->dram_IntlvEn[dram] = (low_base >> 8) & 0x7;
- pvt->dram_base[dram] = (((((u64) high_base & 0x000000FF) << 32) |
- ((u64) low_base & 0xFFFF0000))) << 8;
+ pvt->dram_base[dram] = (((u64)high_base & 0x000000FF) << 40) |
+ (((u64)low_base & 0xFFFF0000) << 8);
low_offset = K8_DRAM_LIMIT_LOW + (dram << 3);
high_offset = F10_DRAM_LIMIT_HIGH + (dram << 3);
* Extract address values and form a LIMIT address. Limit is the HIGHEST
* memory location of the region, so low 24 bits need to be all ones.
*/
- low_limit |= 0x0000FFFF;
- pvt->dram_limit[dram] =
- ((((u64) high_limit << 32) + (u64) low_limit) << 8) | (0xFF);
+ pvt->dram_limit[dram] = (((u64)high_limit & 0x000000FF) << 40) |
+ (((u64) low_limit & 0xFFFF0000) << 8) |
+ 0x00FFFFFF;
}
static void f10_read_dram_ctl_register(struct amd64_pvt *pvt)
debugf1("InputAddr=0x%x channelselect=%d\n", in_addr, cs);
- for (csrow = 0; csrow < CHIPSELECT_COUNT; csrow++) {
+ for (csrow = 0; csrow < pvt->cs_count; csrow++) {
cs_base = amd64_get_dct_base(pvt, cs, csrow);
if (!(cs_base & K8_DCSB_CS_ENABLE))
* The @sys_addr is usually an error address received from the hardware.
*/
static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info,
+ struct err_regs *info,
u64 sys_addr)
{
struct amd64_pvt *pvt = mci->pvt_info;
if (csrow >= 0) {
error_address_to_page_and_offset(sys_addr, &page, &offset);
- syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
- syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+ syndrome = HIGH_SYNDROME(info->nbsl) << 8;
+ syndrome |= LOW_SYNDROME(info->nbsh);
/*
* Is CHIPKILL on? If so, then we can attempt to use the
* - 0: if no valid error is indicated
*/
static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *regs)
+ struct err_regs *regs)
{
struct amd64_pvt *pvt;
struct pci_dev *misc_f3_ctl;
* - 0: if no error is found
*/
static int amd64_get_error_info(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
struct amd64_pvt *pvt;
- struct amd64_error_info_regs regs;
+ struct err_regs regs;
pvt = mci->pvt_info;
return 1;
}
-static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
-{
- u32 err_code;
- u32 ec_tt; /* error code transaction type (2b) */
- u32 ec_ll; /* error code cache level (2b) */
-
- err_code = EXTRACT_ERROR_CODE(info->nbsl);
- ec_ll = EXTRACT_LL_CODE(err_code);
- ec_tt = EXTRACT_TT_CODE(err_code);
-
- amd64_mc_printk(mci, KERN_ERR,
- "GART TLB event: transaction type(%s), "
- "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
-}
-
-static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
-{
- u32 err_code;
- u32 ec_rrrr; /* error code memory transaction (4b) */
- u32 ec_tt; /* error code transaction type (2b) */
- u32 ec_ll; /* error code cache level (2b) */
-
- err_code = EXTRACT_ERROR_CODE(info->nbsl);
- ec_ll = EXTRACT_LL_CODE(err_code);
- ec_tt = EXTRACT_TT_CODE(err_code);
- ec_rrrr = EXTRACT_RRRR_CODE(err_code);
-
- amd64_mc_printk(mci, KERN_ERR,
- "cache hierarchy error: memory transaction type(%s), "
- "transaction type(%s), cache level(%s)\n",
- rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
-}
-
-
/*
* Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
* ADDRESS and process.
*/
static void amd64_handle_ce(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
struct amd64_pvt *pvt = mci->pvt_info;
u64 SystemAddress;
/* Handle any Un-correctable Errors (UEs) */
static void amd64_handle_ue(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
int csrow;
u64 SystemAddress;
}
}
-static void amd64_decode_bus_error(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
+ struct err_regs *info)
{
- u32 err_code, ext_ec;
- u32 ec_pp; /* error code participating processor (2p) */
- u32 ec_to; /* error code timed out (1b) */
- u32 ec_rrrr; /* error code memory transaction (4b) */
- u32 ec_ii; /* error code memory or I/O (2b) */
- u32 ec_ll; /* error code cache level (2b) */
+ u32 ec = ERROR_CODE(info->nbsl);
+ u32 xec = EXT_ERROR_CODE(info->nbsl);
+ int ecc_type = info->nbsh & (0x3 << 13);
- ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
- err_code = EXTRACT_ERROR_CODE(info->nbsl);
-
- ec_ll = EXTRACT_LL_CODE(err_code);
- ec_ii = EXTRACT_II_CODE(err_code);
- ec_rrrr = EXTRACT_RRRR_CODE(err_code);
- ec_to = EXTRACT_TO_CODE(err_code);
- ec_pp = EXTRACT_PP_CODE(err_code);
-
- amd64_mc_printk(mci, KERN_ERR,
- "BUS ERROR:\n"
- " time-out(%s) mem or i/o(%s)\n"
- " participating processor(%s)\n"
- " memory transaction type(%s)\n"
- " cache level(%s) Error Found by: %s\n",
- to_msgs[ec_to],
- ii_msgs[ec_ii],
- pp_msgs[ec_pp],
- rrrr_msgs[ec_rrrr],
- ll_msgs[ec_ll],
- (info->nbsh & K8_NBSH_ERR_SCRUBER) ?
- "Scrubber" : "Normal Operation");
-
- /* If this was an 'observed' error, early out */
- if (ec_pp == K8_NBSL_PP_OBS)
- return; /* We aren't the node involved */
-
- /* Parse out the extended error code for ECC events */
- switch (ext_ec) {
- /* F10 changed to one Extended ECC error code */
- case F10_NBSL_EXT_ERR_RES: /* Reserved field */
- case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */
- break;
+ /* Bail early out if this was an 'observed' error */
+ if (PP(ec) == K8_NBSL_PP_OBS)
+ return;
- default:
- amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
- "handling for this error\n");
+ /* Do only ECC errors */
+ if (xec && xec != F10_NBSL_EXT_ERR_ECC)
return;
- }
- if (info->nbsh & K8_NBSH_CECC)
+ if (ecc_type == 2)
amd64_handle_ce(mci, info);
- else if (info->nbsh & K8_NBSH_UECC)
+ else if (ecc_type == 1)
amd64_handle_ue(mci, info);
/*
* catastrophic.
*/
if (info->nbsh & K8_NBSH_OVERFLOW)
- edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
- "Error Overflow set");
+ edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
}
-int amd64_process_error_info(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info,
- int handle_errors)
+void amd64_decode_bus_error(int node_id, struct err_regs *regs)
{
- struct amd64_pvt *pvt;
- struct amd64_error_info_regs *regs;
- u32 err_code, ext_ec;
- int gart_tlb_error = 0;
+ struct mem_ctl_info *mci = mci_lookup[node_id];
- pvt = mci->pvt_info;
-
- /* If caller doesn't want us to process the error, return */
- if (!handle_errors)
- return 1;
-
- regs = info;
-
- debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
- debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n",
- pvt->mc_node_id, regs->nbeah, regs->nbeal);
- debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n",
- regs->nbsh, regs->nbsl);
- debugf1(" Valid Error=%s Overflow=%s\n",
- (regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
- (regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
- debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n",
- (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_ERR_ENABLE) ?
- "True" : "False");
- debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
- (regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_PCC) ?
- "True" : "False");
- debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n",
- (regs->nbsh & K8_NBSH_CECC) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_UECC) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
- "True" : "False");
- debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
- (regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
- (regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
- (regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
- (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
-
-
- err_code = EXTRACT_ERROR_CODE(regs->nbsl);
-
- /* Determine which error type:
- * 1) GART errors - non-fatal, developmental events
- * 2) MEMORY errors
- * 3) BUS errors
- * 4) Unknown error
- */
- if (TEST_TLB_ERROR(err_code)) {
- /*
- * GART errors are intended to help graphics driver developers
- * to detect bad GART PTEs. It is recommended by AMD to disable
- * GART table walk error reporting by default[1] (currently
- * being disabled in mce_cpu_quirks()) and according to the
- * comment in mce_cpu_quirks(), such GART errors can be
- * incorrectly triggered. We may see these errors anyway and
- * unless requested by the user, they won't be reported.
- *
- * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
- * AMD NPT family 0Fh processors
- */
- if (report_gart_errors == 0)
- return 1;
-
- /*
- * Only if GART error reporting is requested should we generate
- * any logs.
- */
- gart_tlb_error = 1;
-
- debugf1("GART TLB error\n");
- amd64_decode_gart_tlb_error(mci, info);
- } else if (TEST_MEM_ERROR(err_code)) {
- debugf1("Memory/Cache error\n");
- amd64_decode_mem_cache_error(mci, info);
- } else if (TEST_BUS_ERROR(err_code)) {
- debugf1("Bus (Link/DRAM) error\n");
- amd64_decode_bus_error(mci, info);
- } else {
- /* shouldn't reach here! */
- amd64_mc_printk(mci, KERN_WARNING,
- "%s(): unknown MCE error 0x%x\n", __func__,
- err_code);
- }
-
- ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
- amd64_mc_printk(mci, KERN_ERR,
- "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
-
- if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
- ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
- (ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
- EXTRACT_LDT_LINK(info->nbsh)) {
-
- amd64_mc_printk(mci, KERN_ERR,
- "Error on hypertransport link: %s\n",
- htlink_msgs[
- EXTRACT_LDT_LINK(info->nbsh)]);
- }
+ __amd64_decode_bus_error(mci, regs);
/*
* Check the UE bit of the NB status high register, if set generate some
* logs. If NOT a GART error, then process the event as a NO-INFO event.
* If it was a GART error, skip that process.
+ *
+ * FIXME: this should go somewhere else, if at all.
*/
- if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
- amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
- if (!gart_tlb_error)
- edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
- }
-
- if (regs->nbsh & K8_NBSH_PCC)
- amd64_mc_printk(mci, KERN_CRIT,
- "PCC (processor context corrupt) set\n");
+ if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
+ edac_mc_handle_ue_no_info(mci, "UE bit is set");
- return 1;
}
-EXPORT_SYMBOL_GPL(amd64_process_error_info);
/*
* The main polling 'check' function, called FROM the edac core to perform the
*/
static void amd64_check(struct mem_ctl_info *mci)
{
- struct amd64_error_info_regs info;
+ struct err_regs regs;
- if (amd64_get_error_info(mci, &info))
- amd64_process_error_info(mci, &info, 1);
+ if (amd64_get_error_info(mci, ®s)) {
+ struct amd64_pvt *pvt = mci->pvt_info;
+ amd_decode_nb_mce(pvt->mc_node_id, ®s, 1);
+ }
}
/*
amd64_dump_misc_regs(pvt);
+ return;
+
err_reg:
debugf0("Reading an MC register failed\n");
* NOTE: CPU Revision Dependent code
*
* Input:
- * @csrow_nr ChipSelect Row Number (0..CHIPSELECT_COUNT-1)
+ * @csrow_nr ChipSelect Row Number (0..pvt->cs_count-1)
* k8 private pointer to -->
* DRAM Bank Address mapping register
* node_id
(pvt->nbcfg & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled"
);
- for (i = 0; i < CHIPSELECT_COUNT; i++) {
+ for (i = 0; i < pvt->cs_count; i++) {
csrow = &mci->csrows[i];
if ((pvt->dcsb0[i] & K8_DCSB_CS_ENABLE) == 0) {
wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
}
-static void check_mcg_ctl(void *ret)
+/* get all cores on this DCT */
+static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid)
{
- u64 msr_val = 0;
- u8 nbe;
+ int cpu;
- rdmsrl(MSR_IA32_MCG_CTL, msr_val);
- nbe = msr_val & K8_MSR_MCGCTL_NBE;
-
- debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
- raw_smp_processor_id(), msr_val,
- (nbe ? "enabled" : "disabled"));
-
- if (!nbe)
- *(int *)ret = 0;
+ for_each_online_cpu(cpu)
+ if (amd_get_nb_id(cpu) == nid)
+ cpumask_set_cpu(cpu, mask);
}
/* check MCG_CTL on all the cpus on this node */
-static int amd64_mcg_ctl_enabled_on_cpus(const cpumask_t *mask)
+static bool amd64_nb_mce_bank_enabled_on_node(int nid)
{
- int ret = 1;
- preempt_disable();
- smp_call_function_many(mask, check_mcg_ctl, &ret, 1);
- preempt_enable();
+ cpumask_t mask;
+ struct msr *msrs;
+ int cpu, nbe, idx = 0;
+ bool ret = false;
+
+ cpumask_clear(&mask);
+
+ get_cpus_on_this_dct_cpumask(&mask, nid);
+
+ msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL);
+ if (!msrs) {
+ amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
+ __func__);
+ return false;
+ }
+
+ rdmsr_on_cpus(&mask, MSR_IA32_MCG_CTL, msrs);
+
+ for_each_cpu(cpu, &mask) {
+ nbe = msrs[idx].l & K8_MSR_MCGCTL_NBE;
+
+ debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
+ cpu, msrs[idx].q,
+ (nbe ? "enabled" : "disabled"));
+ if (!nbe)
+ goto out;
+
+ idx++;
+ }
+ ret = true;
+
+out:
+ kfree(msrs);
return ret;
}
* the memory system completely. A command line option allows to force-enable
* hardware ECC later in amd64_enable_ecc_error_reporting().
*/
+static const char *ecc_warning =
+ "WARNING: ECC is disabled by BIOS. Module will NOT be loaded.\n"
+ " Either Enable ECC in the BIOS, or set 'ecc_enable_override'.\n"
+ " Also, use of the override can cause unknown side effects.\n";
+
static int amd64_check_ecc_enabled(struct amd64_pvt *pvt)
{
u32 value;
- int err = 0, ret = 0;
+ int err = 0;
u8 ecc_enabled = 0;
+ bool nb_mce_en = false;
err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
if (err)
debugf0("Reading K8_NBCTL failed\n");
ecc_enabled = !!(value & K8_NBCFG_ECC_ENABLE);
+ if (!ecc_enabled)
+ amd64_printk(KERN_WARNING, "This node reports that Memory ECC "
+ "is currently disabled, set F3x%x[22] (%s).\n",
+ K8_NBCFG, pci_name(pvt->misc_f3_ctl));
+ else
+ amd64_printk(KERN_INFO, "ECC is enabled by BIOS.\n");
- ret = amd64_mcg_ctl_enabled_on_cpus(cpumask_of_node(pvt->mc_node_id));
-
- debugf0("K8_NBCFG=0x%x, DRAM ECC is %s\n", value,
- (value & K8_NBCFG_ECC_ENABLE ? "enabled" : "disabled"));
-
- if (!ecc_enabled || !ret) {
- if (!ecc_enabled) {
- amd64_printk(KERN_WARNING, "This node reports that "
- "Memory ECC is currently "
- "disabled.\n");
+ nb_mce_en = amd64_nb_mce_bank_enabled_on_node(pvt->mc_node_id);
+ if (!nb_mce_en)
+ amd64_printk(KERN_WARNING, "NB MCE bank disabled, set MSR "
+ "0x%08x[4] on node %d to enable.\n",
+ MSR_IA32_MCG_CTL, pvt->mc_node_id);
- amd64_printk(KERN_WARNING, "bit 0x%lx in register "
- "F3x%x of the MISC_CONTROL device (%s) "
- "should be enabled\n", K8_NBCFG_ECC_ENABLE,
- K8_NBCFG, pci_name(pvt->misc_f3_ctl));
- }
- if (!ret) {
- amd64_printk(KERN_WARNING, "bit 0x%016lx in MSR 0x%08x "
- "of node %d should be enabled\n",
- K8_MSR_MCGCTL_NBE, MSR_IA32_MCG_CTL,
- pvt->mc_node_id);
- }
+ if (!ecc_enabled || !nb_mce_en) {
if (!ecc_enable_override) {
- amd64_printk(KERN_WARNING, "WARNING: ECC is NOT "
- "currently enabled by the BIOS. Module "
- "will NOT be loaded.\n"
- " Either Enable ECC in the BIOS, "
- "or use the 'ecc_enable_override' "
- "parameter.\n"
- " Might be a BIOS bug, if BIOS says "
- "ECC is enabled\n"
- " Use of the override can cause "
- "unknown side effects.\n");
- ret = -ENODEV;
- } else
- /*
- * enable further driver loading if ECC enable is
- * overridden.
- */
- ret = 0;
- } else {
- amd64_printk(KERN_INFO,
- "ECC is enabled by BIOS, Proceeding "
- "with EDAC module initialization\n");
-
+ amd64_printk(KERN_WARNING, "%s", ecc_warning);
+ return -ENODEV;
+ }
+ } else
/* CLEAR the override, since BIOS controlled it */
ecc_enable_override = 0;
- }
- return ret;
+ return 0;
}
struct mcidev_sysfs_attribute sysfs_attrs[ARRAY_SIZE(amd64_dbg_attrs) +
goto err_exit;
ret = -ENOMEM;
- mci = edac_mc_alloc(0, CHIPSELECT_COUNT, pvt->channel_count, node_id);
+ mci = edac_mc_alloc(0, pvt->cs_count, pvt->channel_count, node_id);
if (!mci)
goto err_exit;
mci_lookup[node_id] = mci;
pvt_lookup[node_id] = NULL;
+
+ /* register stuff with EDAC MCE */
+ if (report_gart_errors)
+ amd_report_gart_errors(true);
+
+ amd_register_ecc_decoder(amd64_decode_bus_error);
+
return 0;
err_add_mc:
mci_lookup[pvt->mc_node_id] = NULL;
+ /* unregister from EDAC MCE */
+ amd_report_gart_errors(false);
+ amd_unregister_ecc_decoder(amd64_decode_bus_error);
+
/* Free the EDAC CORE resources */
edac_mc_free(mci);
}