/* Lookup table for all possible MC control instances */
struct amd64_pvt;
-static struct mem_ctl_info *mci_lookup[MAX_NUMNODES];
-static struct amd64_pvt *pvt_lookup[MAX_NUMNODES];
+static struct mem_ctl_info *mci_lookup[EDAC_MAX_NUMNODES];
+static struct amd64_pvt *pvt_lookup[EDAC_MAX_NUMNODES];
/*
* See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
/* Map from a CSROW entry to the mask entry that operates on it */
static inline u32 amd64_map_to_dcs_mask(struct amd64_pvt *pvt, int csrow)
{
- return csrow >> (pvt->num_dcsm >> 3);
+ if (boot_cpu_data.x86 == 0xf && pvt->ext_model < OPTERON_CPU_REV_F)
+ return csrow;
+ else
+ return csrow >> 1;
}
/* return the 'base' address the i'th CS entry of the 'dct' DRAM controller */
intlv_en = pvt->dram_IntlvEn[0];
if (intlv_en == 0) {
- for (node_id = 0; ; ) {
+ for (node_id = 0; node_id < DRAM_REG_COUNT; node_id++) {
if (amd64_base_limit_match(pvt, sys_addr, node_id))
- break;
-
- if (++node_id >= DRAM_REG_COUNT)
- goto err_no_match;
+ goto found;
}
- goto found;
+ goto err_no_match;
}
- if (unlikely((intlv_en != (0x01 << 8)) &&
- (intlv_en != (0x03 << 8)) &&
- (intlv_en != (0x07 << 8)))) {
+ if (unlikely((intlv_en != 0x01) &&
+ (intlv_en != 0x03) &&
+ (intlv_en != 0x07))) {
amd64_printk(KERN_WARNING, "junk value of 0x%x extracted from "
"IntlvEn field of DRAM Base Register for node 0: "
- "This probably indicates a BIOS bug.\n", intlv_en);
+ "this probably indicates a BIOS bug.\n", intlv_en);
return NULL;
}
bits = (((u32) sys_addr) >> 12) & intlv_en;
for (node_id = 0; ; ) {
- if ((pvt->dram_limit[node_id] & intlv_en) == bits)
+ if ((pvt->dram_IntlvSel[node_id] & intlv_en) == bits)
break; /* intlv_sel field matches */
if (++node_id >= DRAM_REG_COUNT)
/* sanity test for sys_addr */
if (unlikely(!amd64_base_limit_match(pvt, sys_addr, node_id))) {
amd64_printk(KERN_WARNING,
- "%s(): sys_addr 0x%lx falls outside base/limit "
- "address range for node %d with node interleaving "
- "enabled.\n", __func__, (unsigned long)sys_addr,
- node_id);
+ "%s(): sys_addr 0x%llx falls outside base/limit "
+ "address range for node %d with node interleaving "
+ "enabled.\n",
+ __func__, sys_addr, node_id);
return NULL;
}
* base/mask register pair, test the condition shown near the start of
* section 3.5.4 (p. 84, BKDG #26094, K8, revA-E).
*/
- for (csrow = 0; csrow < CHIPSELECT_COUNT; csrow++) {
+ for (csrow = 0; csrow < pvt->cs_count; csrow++) {
/* This DRAM chip select is disabled on this node */
if ((pvt->dcsb0[csrow] & K8_DCSB_CS_ENABLE) == 0)
u64 base, mask;
pvt = mci->pvt_info;
- BUG_ON((csrow < 0) || (csrow >= CHIPSELECT_COUNT));
+ BUG_ON((csrow < 0) || (csrow >= pvt->cs_count));
base = base_from_dct_base(pvt, csrow);
mask = mask_from_dct_mask(pvt, csrow);
*/
static void amd64_set_dct_base_and_mask(struct amd64_pvt *pvt)
{
- if (pvt->ext_model >= OPTERON_CPU_REV_F) {
+
+ if (boot_cpu_data.x86 == 0xf && pvt->ext_model < OPTERON_CPU_REV_F) {
+ pvt->dcsb_base = REV_E_DCSB_BASE_BITS;
+ pvt->dcsm_mask = REV_E_DCSM_MASK_BITS;
+ pvt->dcs_mask_notused = REV_E_DCS_NOTUSED_BITS;
+ pvt->dcs_shift = REV_E_DCS_SHIFT;
+ pvt->cs_count = 8;
+ pvt->num_dcsm = 8;
+ } else {
pvt->dcsb_base = REV_F_F1Xh_DCSB_BASE_BITS;
pvt->dcsm_mask = REV_F_F1Xh_DCSM_MASK_BITS;
pvt->dcs_mask_notused = REV_F_F1Xh_DCS_NOTUSED_BITS;
pvt->dcs_shift = REV_F_F1Xh_DCS_SHIFT;
- switch (boot_cpu_data.x86) {
- case 0xf:
- pvt->num_dcsm = REV_F_DCSM_COUNT;
- break;
-
- case 0x10:
- pvt->num_dcsm = F10_DCSM_COUNT;
- break;
-
- case 0x11:
- pvt->num_dcsm = F11_DCSM_COUNT;
- break;
-
- default:
- amd64_printk(KERN_ERR, "Unsupported family!\n");
- break;
+ if (boot_cpu_data.x86 == 0x11) {
+ pvt->cs_count = 4;
+ pvt->num_dcsm = 2;
+ } else {
+ pvt->cs_count = 8;
+ pvt->num_dcsm = 4;
}
- } else {
- pvt->dcsb_base = REV_E_DCSB_BASE_BITS;
- pvt->dcsm_mask = REV_E_DCSM_MASK_BITS;
- pvt->dcs_mask_notused = REV_E_DCS_NOTUSED_BITS;
- pvt->dcs_shift = REV_E_DCS_SHIFT;
- pvt->num_dcsm = REV_E_DCSM_COUNT;
}
}
amd64_set_dct_base_and_mask(pvt);
- for (cs = 0; cs < CHIPSELECT_COUNT; cs++) {
+ for (cs = 0; cs < pvt->cs_count; cs++) {
reg = K8_DCSB0 + (cs * 4);
err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
&pvt->dcsb0[cs]);
* different from the node that detected the error.
*/
src_mci = find_mc_by_sys_addr(mci, SystemAddress);
- if (src_mci) {
+ if (!src_mci) {
amd64_mc_printk(mci, KERN_ERR,
"failed to map error address 0x%lx to a node\n",
(unsigned long)SystemAddress);
*/
static int f10_early_channel_count(struct amd64_pvt *pvt)
{
+ int dbams[] = { DBAM0, DBAM1 };
int err = 0, channels = 0;
+ int i, j;
u32 dbam;
err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0);
* is more than just one DIMM present in unganged mode. Need to check
* both controllers since DIMMs can be placed in either one.
*/
- channels = 0;
- err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM0, &dbam);
- if (err)
- goto err_reg;
-
- if (DBAM_DIMM(0, dbam) > 0)
- channels++;
- if (DBAM_DIMM(1, dbam) > 0)
- channels++;
- if (DBAM_DIMM(2, dbam) > 0)
- channels++;
- if (DBAM_DIMM(3, dbam) > 0)
- channels++;
-
- /* If more than 2 DIMMs are present, then we have 2 channels */
- if (channels > 2)
- channels = 2;
- else if (channels == 0) {
- /* No DIMMs on DCT0, so look at DCT1 */
- err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM1, &dbam);
+ for (i = 0; i < ARRAY_SIZE(dbams); i++) {
+ err = pci_read_config_dword(pvt->dram_f2_ctl, dbams[i], &dbam);
if (err)
goto err_reg;
- if (DBAM_DIMM(0, dbam) > 0)
- channels++;
- if (DBAM_DIMM(1, dbam) > 0)
- channels++;
- if (DBAM_DIMM(2, dbam) > 0)
- channels++;
- if (DBAM_DIMM(3, dbam) > 0)
- channels++;
-
- if (channels > 2)
- channels = 2;
+ for (j = 0; j < 4; j++) {
+ if (DBAM_DIMM(j, dbam) > 0) {
+ channels++;
+ break;
+ }
+ }
}
- /* If we found ALL 0 values, then assume just ONE DIMM-ONE Channel */
- if (channels == 0)
- channels = 1;
-
debugf0("MCT channel count: %d\n", channels);
return channels;
pvt->dram_IntlvEn[dram] = (low_base >> 8) & 0x7;
- pvt->dram_base[dram] = (((((u64) high_base & 0x000000FF) << 32) |
- ((u64) low_base & 0xFFFF0000))) << 8;
+ pvt->dram_base[dram] = (((u64)high_base & 0x000000FF) << 40) |
+ (((u64)low_base & 0xFFFF0000) << 8);
low_offset = K8_DRAM_LIMIT_LOW + (dram << 3);
high_offset = F10_DRAM_LIMIT_HIGH + (dram << 3);
* Extract address values and form a LIMIT address. Limit is the HIGHEST
* memory location of the region, so low 24 bits need to be all ones.
*/
- low_limit |= 0x0000FFFF;
- pvt->dram_limit[dram] =
- ((((u64) high_limit << 32) + (u64) low_limit) << 8) | (0xFF);
+ pvt->dram_limit[dram] = (((u64)high_limit & 0x000000FF) << 40) |
+ (((u64) low_limit & 0xFFFF0000) << 8) |
+ 0x00FFFFFF;
}
static void f10_read_dram_ctl_register(struct amd64_pvt *pvt)
debugf1("InputAddr=0x%x channelselect=%d\n", in_addr, cs);
- for (csrow = 0; csrow < CHIPSELECT_COUNT; csrow++) {
+ for (csrow = 0; csrow < pvt->cs_count; csrow++) {
cs_base = amd64_get_dct_base(pvt, cs, csrow);
if (!(cs_base & K8_DCSB_CS_ENABLE))
}
}
-static void amd64_decode_bus_error(struct mem_ctl_info *mci,
- struct err_regs *info, int ecc_type)
+static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
+ struct err_regs *info)
{
u32 ec = ERROR_CODE(info->nbsl);
u32 xec = EXT_ERROR_CODE(info->nbsl);
-
- pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n",
- RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec), PP_MSG(ec));
-
+ int ecc_type = info->nbsh & (0x3 << 13);
/* Bail early out if this was an 'observed' error */
if (PP(ec) == K8_NBSL_PP_OBS)
edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
}
-void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
- int handle_errors)
+void amd64_decode_bus_error(int node_id, struct err_regs *regs)
{
- struct amd64_pvt *pvt = mci->pvt_info;
- int ecc;
- u32 ec = ERROR_CODE(regs->nbsl);
- u32 xec = EXT_ERROR_CODE(regs->nbsl);
+ struct mem_ctl_info *mci = mci_lookup[node_id];
- if (!handle_errors)
- return;
-
- pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id);
-
- /*
- * F10h, revD can disable ErrCpu[3:0] so check that first and also the
- * value encoding has changed so interpret those differently
- */
- if ((boot_cpu_data.x86 == 0x10) &&
- (boot_cpu_data.x86_model > 8)) {
- if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
- pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
- } else {
- pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
- }
-
- pr_emerg(" Error: %sorrected",
- ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
- pr_cont(", Report Error: %s",
- ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
- pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
- ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
- ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
-
- /* do the two bits[14:13] together */
- ecc = regs->nbsh & (0x3 << 13);
- if (ecc)
- pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
-
- pr_cont("\n");
-
- if (TLB_ERROR(ec)) {
- /*
- * GART errors are intended to help graphics driver developers
- * to detect bad GART PTEs. It is recommended by AMD to disable
- * GART table walk error reporting by default[1] (currently
- * being disabled in mce_cpu_quirks()) and according to the
- * comment in mce_cpu_quirks(), such GART errors can be
- * incorrectly triggered. We may see these errors anyway and
- * unless requested by the user, they won't be reported.
- *
- * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
- * AMD NPT family 0Fh processors
- */
- if (!report_gart_errors)
- return;
-
- pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
- TT_MSG(ec), LL_MSG(ec));
- } else if (MEM_ERROR(ec)) {
- pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
- " Cache Level: %s",
- RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
- } else if (BUS_ERROR(ec)) {
- pr_emerg(" Bus (Link/DRAM) error\n");
- amd64_decode_bus_error(mci, regs, ecc);
- } else {
- /* shouldn't reach here! */
- amd64_mc_printk(mci, KERN_WARNING,
- "%s(): unknown MCE error 0x%x\n", __func__, ec);
- }
-
- pr_emerg("%s.\n", EXT_ERR_MSG(xec));
+ __amd64_decode_bus_error(mci, regs);
/*
* Check the UE bit of the NB status high register, if set generate some
* logs. If NOT a GART error, then process the event as a NO-INFO event.
* If it was a GART error, skip that process.
+ *
+ * FIXME: this should go somewhere else, if at all.
*/
if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
edac_mc_handle_ue_no_info(mci, "UE bit is set");
+
}
/*
{
struct err_regs regs;
- if (amd64_get_error_info(mci, ®s))
- amd64_decode_nb_mce(mci, ®s, 1);
+ if (amd64_get_error_info(mci, ®s)) {
+ struct amd64_pvt *pvt = mci->pvt_info;
+ amd_decode_nb_mce(pvt->mc_node_id, ®s, 1);
+ }
}
/*
* NOTE: CPU Revision Dependent code
*
* Input:
- * @csrow_nr ChipSelect Row Number (0..CHIPSELECT_COUNT-1)
+ * @csrow_nr ChipSelect Row Number (0..pvt->cs_count-1)
* k8 private pointer to -->
* DRAM Bank Address mapping register
* node_id
(pvt->nbcfg & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled"
);
- for (i = 0; i < CHIPSELECT_COUNT; i++) {
+ for (i = 0; i < pvt->cs_count; i++) {
csrow = &mci->csrows[i];
if ((pvt->dcsb0[i] & K8_DCSB_CS_ENABLE) == 0) {
wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
}
-static void check_mcg_ctl(void *ret)
+/* get all cores on this DCT */
+static void get_cpus_on_this_dct_cpumask(cpumask_t *mask, int nid)
{
- u64 msr_val = 0;
- u8 nbe;
-
- rdmsrl(MSR_IA32_MCG_CTL, msr_val);
- nbe = msr_val & K8_MSR_MCGCTL_NBE;
+ int cpu;
- debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
- raw_smp_processor_id(), msr_val,
- (nbe ? "enabled" : "disabled"));
-
- if (!nbe)
- *(int *)ret = 0;
+ for_each_online_cpu(cpu)
+ if (amd_get_nb_id(cpu) == nid)
+ cpumask_set_cpu(cpu, mask);
}
/* check MCG_CTL on all the cpus on this node */
-static int amd64_mcg_ctl_enabled_on_cpus(const cpumask_t *mask)
+static bool amd64_nb_mce_bank_enabled_on_node(int nid)
{
- int ret = 1;
- preempt_disable();
- smp_call_function_many(mask, check_mcg_ctl, &ret, 1);
- preempt_enable();
+ cpumask_t mask;
+ struct msr *msrs;
+ int cpu, nbe, idx = 0;
+ bool ret = false;
+
+ cpumask_clear(&mask);
+
+ get_cpus_on_this_dct_cpumask(&mask, nid);
+
+ msrs = kzalloc(sizeof(struct msr) * cpumask_weight(&mask), GFP_KERNEL);
+ if (!msrs) {
+ amd64_printk(KERN_WARNING, "%s: error allocating msrs\n",
+ __func__);
+ return false;
+ }
+
+ rdmsr_on_cpus(&mask, MSR_IA32_MCG_CTL, msrs);
+
+ for_each_cpu(cpu, &mask) {
+ nbe = msrs[idx].l & K8_MSR_MCGCTL_NBE;
+
+ debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
+ cpu, msrs[idx].q,
+ (nbe ? "enabled" : "disabled"));
+
+ if (!nbe)
+ goto out;
+
+ idx++;
+ }
+ ret = true;
+out:
+ kfree(msrs);
return ret;
}
* the memory system completely. A command line option allows to force-enable
* hardware ECC later in amd64_enable_ecc_error_reporting().
*/
+static const char *ecc_warning =
+ "WARNING: ECC is disabled by BIOS. Module will NOT be loaded.\n"
+ " Either Enable ECC in the BIOS, or set 'ecc_enable_override'.\n"
+ " Also, use of the override can cause unknown side effects.\n";
+
static int amd64_check_ecc_enabled(struct amd64_pvt *pvt)
{
u32 value;
- int err = 0, ret = 0;
+ int err = 0;
u8 ecc_enabled = 0;
+ bool nb_mce_en = false;
err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
if (err)
debugf0("Reading K8_NBCTL failed\n");
ecc_enabled = !!(value & K8_NBCFG_ECC_ENABLE);
+ if (!ecc_enabled)
+ amd64_printk(KERN_WARNING, "This node reports that Memory ECC "
+ "is currently disabled, set F3x%x[22] (%s).\n",
+ K8_NBCFG, pci_name(pvt->misc_f3_ctl));
+ else
+ amd64_printk(KERN_INFO, "ECC is enabled by BIOS.\n");
- ret = amd64_mcg_ctl_enabled_on_cpus(cpumask_of_node(pvt->mc_node_id));
-
- debugf0("K8_NBCFG=0x%x, DRAM ECC is %s\n", value,
- (value & K8_NBCFG_ECC_ENABLE ? "enabled" : "disabled"));
-
- if (!ecc_enabled || !ret) {
- if (!ecc_enabled) {
- amd64_printk(KERN_WARNING, "This node reports that "
- "Memory ECC is currently "
- "disabled.\n");
+ nb_mce_en = amd64_nb_mce_bank_enabled_on_node(pvt->mc_node_id);
+ if (!nb_mce_en)
+ amd64_printk(KERN_WARNING, "NB MCE bank disabled, set MSR "
+ "0x%08x[4] on node %d to enable.\n",
+ MSR_IA32_MCG_CTL, pvt->mc_node_id);
- amd64_printk(KERN_WARNING, "bit 0x%lx in register "
- "F3x%x of the MISC_CONTROL device (%s) "
- "should be enabled\n", K8_NBCFG_ECC_ENABLE,
- K8_NBCFG, pci_name(pvt->misc_f3_ctl));
- }
- if (!ret) {
- amd64_printk(KERN_WARNING, "bit 0x%016lx in MSR 0x%08x "
- "of node %d should be enabled\n",
- K8_MSR_MCGCTL_NBE, MSR_IA32_MCG_CTL,
- pvt->mc_node_id);
- }
+ if (!ecc_enabled || !nb_mce_en) {
if (!ecc_enable_override) {
- amd64_printk(KERN_WARNING, "WARNING: ECC is NOT "
- "currently enabled by the BIOS. Module "
- "will NOT be loaded.\n"
- " Either Enable ECC in the BIOS, "
- "or use the 'ecc_enable_override' "
- "parameter.\n"
- " Might be a BIOS bug, if BIOS says "
- "ECC is enabled\n"
- " Use of the override can cause "
- "unknown side effects.\n");
- ret = -ENODEV;
- } else
- /*
- * enable further driver loading if ECC enable is
- * overridden.
- */
- ret = 0;
- } else {
- amd64_printk(KERN_INFO,
- "ECC is enabled by BIOS, Proceeding "
- "with EDAC module initialization\n");
-
- /* Signal good ECC status */
- ret = 0;
-
+ amd64_printk(KERN_WARNING, "%s", ecc_warning);
+ return -ENODEV;
+ }
+ } else
/* CLEAR the override, since BIOS controlled it */
ecc_enable_override = 0;
- }
- return ret;
+ return 0;
}
struct mcidev_sysfs_attribute sysfs_attrs[ARRAY_SIZE(amd64_dbg_attrs) +
goto err_exit;
ret = -ENOMEM;
- mci = edac_mc_alloc(0, CHIPSELECT_COUNT, pvt->channel_count, node_id);
+ mci = edac_mc_alloc(0, pvt->cs_count, pvt->channel_count, node_id);
if (!mci)
goto err_exit;
mci_lookup[node_id] = mci;
pvt_lookup[node_id] = NULL;
+
+ /* register stuff with EDAC MCE */
+ if (report_gart_errors)
+ amd_report_gart_errors(true);
+
+ amd_register_ecc_decoder(amd64_decode_bus_error);
+
return 0;
err_add_mc:
mci_lookup[pvt->mc_node_id] = NULL;
+ /* unregister from EDAC MCE */
+ amd_report_gart_errors(false);
+ amd_unregister_ecc_decoder(amd64_decode_bus_error);
+
/* Free the EDAC CORE resources */
edac_mc_free(mci);
}