From a311ca2f45d953f209f01d89115153ac16ab936c Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Tue, 16 Mar 2010 16:01:19 +0000 Subject: [PATCH] - Extend the machine check record structure to include several fields useful for parsing model-specific and other fields in machine check events including the global machine check capabilities and status registers, CPU identification, and the FreeBSD CPU ID. - Report these added fields in the console log of a machine check so that a record structure can be reconstituted from the console messages. - Parse new architectural errors including memory controller errors. MFC after: 1 week --- sys/amd64/amd64/mca.c | 52 ++++++++++++++++++++++++++++++++-- sys/amd64/include/mca.h | 5 ++++ sys/amd64/include/specialreg.h | 12 ++++++++ sys/i386/i386/mca.c | 52 ++++++++++++++++++++++++++++++++-- sys/i386/include/mca.h | 5 ++++ sys/i386/include/specialreg.h | 12 ++++++++ 6 files changed, 132 insertions(+), 6 deletions(-) diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index 344acaf6b225..ed55c2c1d3cd 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -186,19 +186,46 @@ mca_error_request(uint16_t mca_error) return ("???"); } +static const char * +mca_error_mmtype(uint16_t mca_error) +{ + + switch ((mca_error & 0x70) >> 4) { + case 0x0: + return ("GEN"); + case 0x1: + return ("RD"); + case 0x2: + return ("WR"); + case 0x3: + return ("AC"); + case 0x4: + return ("MS"); + } + return ("???"); +} + /* Dump details about a single machine check. */ static void __nonnull(1) mca_log(const struct mca_record *rec) { uint16_t mca_error; - printf("MCA: bank %d, status 0x%016llx\n", rec->mr_bank, + printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, (long long)rec->mr_status); - printf("MCA: CPU %d ", rec->mr_apic_id); + printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n", + (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status); + printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor, + rec->mr_cpu_id, rec->mr_apic_id); + printf("MCA: CPU %d ", rec->mr_cpu); if (rec->mr_status & MC_STATUS_UC) printf("UNCOR "); - else + else { printf("COR "); + if (rec->mr_mcg_cap & MCG_CAP_TES_P) + printf("(%lld) ", ((long long)rec->mr_status & + MC_STATUS_COR_COUNT) >> 38); + } if (rec->mr_status & MC_STATUS_PCC) printf("PCC "); if (rec->mr_status & MC_STATUS_OVER) @@ -221,6 +248,9 @@ mca_log(const struct mca_record *rec) case 0x0004: printf("FRC error"); break; + case 0x0005: + printf("internal parity error"); + break; case 0x0400: printf("internal timer error"); break; @@ -245,6 +275,17 @@ mca_log(const struct mca_record *rec) break; } + /* Memory controller error. */ + if ((mca_error & 0xef80) == 0x0080) { + printf("%s channel ", mca_error_mmtype(mca_error)); + if ((mca_error & 0x000f) != 0x000f) + printf("%d", mca_error & 0x000f); + else + printf("??"); + printf(" memory error"); + break; + } + /* Cache error. */ if ((mca_error & 0xef00) == 0x0100) { printf("%sCACHE %s %s error", @@ -322,6 +363,11 @@ mca_check_status(int bank, struct mca_record *rec) rec->mr_misc = rdmsr(MSR_MC_MISC(bank)); rec->mr_tsc = rdtsc(); rec->mr_apic_id = PCPU_GET(apic_id); + rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP); + rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS); + rec->mr_cpu_id = cpu_id; + rec->mr_cpu_vendor_id = cpu_vendor_id; + rec->mr_cpu = PCPU_GET(cpuid); /* * Clear machine check. Don't do this for uncorrectable diff --git a/sys/amd64/include/mca.h b/sys/amd64/include/mca.h index ddc3aeb649a8..bc09480906b1 100644 --- a/sys/amd64/include/mca.h +++ b/sys/amd64/include/mca.h @@ -37,6 +37,11 @@ struct mca_record { uint64_t mr_tsc; int mr_apic_id; int mr_bank; + uint64_t mr_mcg_cap; + uint64_t mr_mcg_status; + int mr_cpu_id; + int mr_cpu_vendor_id; + int mr_cpu; }; #ifdef _KERNEL diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index b325ed441db1..baf2466a5cf8 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -267,6 +267,7 @@ #define MSR_MTRR16kBase 0x258 #define MSR_MTRR4kBase 0x268 #define MSR_PAT 0x277 +#define MSR_MC0_CTL2 0x280 #define MSR_MTRRdefType 0x2ff #define MSR_MC0_CTL 0x400 #define MSR_MC0_STATUS 0x401 @@ -352,8 +353,10 @@ #define MCG_CAP_COUNT 0x000000ff #define MCG_CAP_CTL_P 0x00000100 #define MCG_CAP_EXT_P 0x00000200 +#define MCG_CAP_CMCI_P 0x00000400 #define MCG_CAP_TES_P 0x00000800 #define MCG_CAP_EXT_CNT 0x00ff0000 +#define MCG_CAP_SER_P 0x01000000 #define MCG_STATUS_RIPV 0x00000001 #define MCG_STATUS_EIPV 0x00000002 #define MCG_STATUS_MCIP 0x00000004 @@ -363,9 +366,14 @@ #define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4) #define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4) #define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4) +#define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */ #define MC_STATUS_MCA_ERROR 0x000000000000ffffUL #define MC_STATUS_MODEL_ERROR 0x00000000ffff0000UL #define MC_STATUS_OTHER_INFO 0x01ffffff00000000UL +#define MC_STATUS_COR_COUNT 0x001fffc000000000UL /* If MCG_CAP_TES_P */ +#define MC_STATUS_TES_STATUS 0x0060000000000000UL /* If MCG_CAP_TES_P */ +#define MC_STATUS_AR 0x0080000000000000UL /* If MCG_CAP_CMCI_P */ +#define MC_STATUS_S 0x0100000000000000UL /* If MCG_CAP_CMCI_P */ #define MC_STATUS_PCC 0x0200000000000000UL #define MC_STATUS_ADDRV 0x0400000000000000UL #define MC_STATUS_MISCV 0x0800000000000000UL @@ -373,6 +381,10 @@ #define MC_STATUS_UC 0x2000000000000000UL #define MC_STATUS_OVER 0x4000000000000000UL #define MC_STATUS_VAL 0x8000000000000000UL +#define MC_MISC_RA_LSB 0x000000000000003fUL /* If MCG_CAP_SER_P */ +#define MC_MISC_ADDRESS_MODE 0x00000000000001c0UL /* If MCG_CAP_SER_P */ +#define MC_CTL2_THRESHOLD 0x0000000000003fffUL +#define MC_CTL2_CMCI_EN 0x0000000040000000UL /* * The following four 3-byte registers control the non-cacheable regions. diff --git a/sys/i386/i386/mca.c b/sys/i386/i386/mca.c index 6148af792430..9b9f9451597f 100644 --- a/sys/i386/i386/mca.c +++ b/sys/i386/i386/mca.c @@ -177,19 +177,46 @@ mca_error_request(uint16_t mca_error) return ("???"); } +static const char * +mca_error_mmtype(uint16_t mca_error) +{ + + switch ((mca_error & 0x70) >> 4) { + case 0x0: + return ("GEN"); + case 0x1: + return ("RD"); + case 0x2: + return ("WR"); + case 0x3: + return ("AC"); + case 0x4: + return ("MS"); + } + return ("???"); +} + /* Dump details about a single machine check. */ static void __nonnull(1) mca_log(const struct mca_record *rec) { uint16_t mca_error; - printf("MCA: bank %d, status 0x%016llx\n", rec->mr_bank, + printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank, (long long)rec->mr_status); - printf("MCA: CPU %d ", rec->mr_apic_id); + printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n", + (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status); + printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor, + rec->mr_cpu_id, rec->mr_apic_id); + printf("MCA: CPU %d ", rec->mr_cpu); if (rec->mr_status & MC_STATUS_UC) printf("UNCOR "); - else + else { printf("COR "); + if (rec->mr_mcg_cap & MCG_CAP_TES_P) + printf("(%lld) ", ((long long)rec->mr_status & + MC_STATUS_COR_COUNT) >> 38); + } if (rec->mr_status & MC_STATUS_PCC) printf("PCC "); if (rec->mr_status & MC_STATUS_OVER) @@ -212,6 +239,9 @@ mca_log(const struct mca_record *rec) case 0x0004: printf("FRC error"); break; + case 0x0005: + printf("internal parity error"); + break; case 0x0400: printf("internal timer error"); break; @@ -236,6 +266,17 @@ mca_log(const struct mca_record *rec) break; } + /* Memory controller error. */ + if ((mca_error & 0xef80) == 0x0080) { + printf("%s channel ", mca_error_mmtype(mca_error)); + if ((mca_error & 0x000f) != 0x000f) + printf("%d", mca_error & 0x000f); + else + printf("??"); + printf(" memory error"); + break; + } + /* Cache error. */ if ((mca_error & 0xef00) == 0x0100) { printf("%sCACHE %s %s error", @@ -313,6 +354,11 @@ mca_check_status(int bank, struct mca_record *rec) rec->mr_misc = rdmsr(MSR_MC_MISC(bank)); rec->mr_tsc = rdtsc(); rec->mr_apic_id = PCPU_GET(apic_id); + rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP); + rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS); + rec->mr_cpu_id = cpu_id; + rec->mr_cpu_vendor_id = cpu_vendor_id; + rec->mr_cpu = PCPU_GET(cpuid); /* * Clear machine check. Don't do this for uncorrectable diff --git a/sys/i386/include/mca.h b/sys/i386/include/mca.h index ddc3aeb649a8..bc09480906b1 100644 --- a/sys/i386/include/mca.h +++ b/sys/i386/include/mca.h @@ -37,6 +37,11 @@ struct mca_record { uint64_t mr_tsc; int mr_apic_id; int mr_bank; + uint64_t mr_mcg_cap; + uint64_t mr_mcg_status; + int mr_cpu_id; + int mr_cpu_vendor_id; + int mr_cpu; }; #ifdef _KERNEL diff --git a/sys/i386/include/specialreg.h b/sys/i386/include/specialreg.h index e791a70f2cde..cbcc0fa09274 100644 --- a/sys/i386/include/specialreg.h +++ b/sys/i386/include/specialreg.h @@ -273,6 +273,7 @@ #define MSR_MTRR16kBase 0x258 #define MSR_MTRR4kBase 0x268 #define MSR_PAT 0x277 +#define MSR_MC0_CTL2 0x280 #define MSR_MTRRdefType 0x2ff #define MSR_MC0_CTL 0x400 #define MSR_MC0_STATUS 0x401 @@ -421,8 +422,10 @@ #define MCG_CAP_COUNT 0x000000ff #define MCG_CAP_CTL_P 0x00000100 #define MCG_CAP_EXT_P 0x00000200 +#define MCG_CAP_CMCI_P 0x00000400 #define MCG_CAP_TES_P 0x00000800 #define MCG_CAP_EXT_CNT 0x00ff0000 +#define MCG_CAP_SER_P 0x01000000 #define MCG_STATUS_RIPV 0x00000001 #define MCG_STATUS_EIPV 0x00000002 #define MCG_STATUS_MCIP 0x00000004 @@ -432,9 +435,14 @@ #define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4) #define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4) #define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4) +#define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */ #define MC_STATUS_MCA_ERROR 0x000000000000ffffULL #define MC_STATUS_MODEL_ERROR 0x00000000ffff0000ULL #define MC_STATUS_OTHER_INFO 0x01ffffff00000000ULL +#define MC_STATUS_COR_COUNT 0x001fffc000000000ULL /* If MCG_CAP_TES_P */ +#define MC_STATUS_TES_STATUS 0x0060000000000000ULL /* If MCG_CAP_TES_P */ +#define MC_STATUS_AR 0x0080000000000000ULL /* If MCG_CAP_CMCI_P */ +#define MC_STATUS_S 0x0100000000000000ULL /* If MCG_CAP_CMCI_P */ #define MC_STATUS_PCC 0x0200000000000000ULL #define MC_STATUS_ADDRV 0x0400000000000000ULL #define MC_STATUS_MISCV 0x0800000000000000ULL @@ -442,6 +450,10 @@ #define MC_STATUS_UC 0x2000000000000000ULL #define MC_STATUS_OVER 0x4000000000000000ULL #define MC_STATUS_VAL 0x8000000000000000ULL +#define MC_MISC_RA_LSB 0x000000000000003fULL /* If MCG_CAP_SER_P */ +#define MC_MISC_ADDRESS_MODE 0x00000000000001c0ULL /* If MCG_CAP_SER_P */ +#define MC_CTL2_THRESHOLD 0x0000000000003fffULL +#define MC_CTL2_CMCI_EN 0x0000000040000000ULL /* * The following four 3-byte registers control the non-cacheable regions.