# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1246372839 -3600
# Node ID 7bbbc57163d58c27f1e3883b20d09c72d04351ab
# Parent 00502df38143d6c26a6db43f9329634cdef76f3e
x86 hvm mce: Support HVM Guest virtual MCA handling.
When MCE# happens, if the error has been contained/recovered by XEN
and it impacts one guest Domain (DOM0/HVM Guest/PV Guest), we will
inject the corresponding vMCE# into the impacted Domain. Guest OS will
go on its own recovery job if it has MCA handler.
Signed-off-by: Liping Ke <liping.ke@xxxxxxxxx>
Signed-off-by: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>
---
xen/arch/x86/cpu/mcheck/mce_intel.c | 157 ++++++++++++++++++++++++++++--------
xen/arch/x86/cpu/mcheck/mctelem.c | 4
xen/arch/x86/cpu/mcheck/mctelem.h | 2
xen/arch/x86/hvm/hvm.c | 37 +++++---
xen/arch/x86/hvm/irq.c | 7 +
xen/arch/x86/hvm/vmx/intr.c | 4
xen/arch/x86/x86_64/traps.c | 9 +-
xen/include/asm-x86/domain.h | 1
xen/include/asm-x86/hvm/hvm.h | 4
9 files changed, 169 insertions(+), 56 deletions(-)
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Tue Jun 30 15:40:39 2009 +0100
@@ -10,6 +10,7 @@
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
+#include <asm/p2m.h>
#include "mce.h"
#include "x86_mca.h"
@@ -224,7 +225,7 @@ static struct bank_entry* alloc_bank_ent
for vMCE# MSRs virtualization
*/
-static int fill_vmsr_data(int cpu, struct mcinfo_bank *mc_bank,
+static int fill_vmsr_data(struct mcinfo_bank *mc_bank,
uint64_t gstatus) {
struct domain *d;
struct bank_entry *entry;
@@ -240,28 +241,89 @@ static int fill_vmsr_data(int cpu, struc
return 0;
}
+ /* For HVM guest, Only when first vMCE is consumed by HVM guest
successfully,
+ * will we generate another node and inject another vMCE
+ */
+ if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection > 0) )
+ {
+ printk(KERN_DEBUG "MCE: HVM guest has not handled previous"
+ " vMCE yet!\n");
+ return -1;
+ }
entry = alloc_bank_entry();
if (entry == NULL)
- return -1;
+ return -1;
+
entry->mci_status = mc_bank->mc_status;
entry->mci_addr = mc_bank->mc_addr;
entry->mci_misc = mc_bank->mc_misc;
- entry->cpu = cpu;
entry->bank = mc_bank->mc_bank;
- spin_lock(&d->arch.vmca_msrs.lock);
+ spin_lock(&d->arch.vmca_msrs.lock);
/* New error Node, insert to the tail of the per_dom data */
list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
/* Fill MSR global status */
d->arch.vmca_msrs.mcg_status = gstatus;
/* New node impact the domain, need another vMCE# injection*/
d->arch.vmca_msrs.nr_injection++;
- spin_unlock(&d->arch.vmca_msrs.lock);
-
- printk(KERN_DEBUG "MCE: Found error @[CPU%d BANK%d "
+ spin_unlock(&d->arch.vmca_msrs.lock);
+
+ printk(KERN_DEBUG "MCE: Found error @[BANK%d "
"status %"PRIx64" addr %"PRIx64" domid %d]\n ",
- entry->cpu, mc_bank->mc_bank,
- mc_bank->mc_status, mc_bank->mc_addr, mc_bank->mc_domid);
+ mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
+ mc_bank->mc_domid);
+ }
+ return 0;
+}
+
+static int inject_mce(struct domain *d)
+{
+ int cpu = smp_processor_id();
+ cpumask_t affinity;
+
+ /* PV guest and HVM guest have different vMCE# injection
+ * methods */
+
+ if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
+ {
+ if (d->is_hvm)
+ {
+ printk(KERN_DEBUG "MCE: inject vMCE to HVM DOM %d\n",
+ d->domain_id);
+ vcpu_kick(d->vcpu[0]);
+ }
+ /* PV guest including DOM0 */
+ else
+ {
+ printk(KERN_DEBUG "MCE: inject vMCE to PV DOM%d\n",
+ d->domain_id);
+ if (guest_has_trap_callback
+ (d, 0, TRAP_machine_check))
+ {
+ d->vcpu[0]->cpu_affinity_tmp =
+ d->vcpu[0]->cpu_affinity;
+ cpus_clear(affinity);
+ cpu_set(cpu, affinity);
+ printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
+ d->vcpu[0]->processor);
+ vcpu_set_affinity(d->vcpu[0], &affinity);
+ vcpu_kick(d->vcpu[0]);
+ }
+ else
+ {
+ printk(KERN_DEBUG "MCE: Kill PV guest with No MCE handler\n");
+ domain_crash(d);
+ }
+ }
+ }
+ else {
+ /* new vMCE comes while first one has not been injected yet,
+ * in this case, injection fails. [We can't lose this vMCE for
+ * the mce node's consistency].
+ */
+ printk(KERN_DEBUG "There's a pending vMCE waiting to be injected "
+ " to this DOM%d!\n", d->domain_id);
+ return -1;
}
return 0;
}
@@ -272,7 +334,7 @@ void intel_UCR_handler(struct mcinfo_ban
struct mca_handle_result *result)
{
struct domain *d;
- unsigned long mfn;
+ unsigned long mfn, gfn;
uint32_t status;
printk(KERN_DEBUG "MCE: Enter EWB UCR recovery action\n");
@@ -280,6 +342,7 @@ void intel_UCR_handler(struct mcinfo_ban
if (bank->mc_addr != 0) {
mfn = bank->mc_addr >> PAGE_SHIFT;
if (!offline_page(mfn, 1, &status)) {
+ /* This is free page */
if (status & PG_OFFLINE_OFFLINED)
result->result = MCA_RECOVERED;
else if (status & PG_OFFLINE_PENDING) {
@@ -289,9 +352,35 @@ void intel_UCR_handler(struct mcinfo_ban
result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
printk(KERN_DEBUG "MCE: This error page is ownded"
" by DOM %d\n", result->owner);
- if (result->owner != 0 && result->owner != DOMID_XEN) {
+ /* Fill vMCE# injection and vMCE# MSR virtualization
+ * related data */
+ bank->mc_domid = result->owner;
+ if ( result->owner != DOMID_XEN ) {
d = get_domain_by_id(result->owner);
- domain_crash(d);
+ gfn =
+ mfn_to_gmfn(d, ((bank->mc_addr) >> PAGE_SHIFT));
+ bank->mc_addr =
+ gfn << PAGE_SHIFT | (bank->mc_addr & PAGE_MASK);
+ if (fill_vmsr_data(bank, global->mc_gstatus) == -1)
+ {
+ printk(KERN_DEBUG "Fill vMCE# data for DOM%d "
+ "failed\n", result->owner);
+ domain_crash(d);
+ return;
+ }
+ /* We will inject vMCE to DOMU*/
+ if ( inject_mce(d) < 0 )
+ {
+ printk(KERN_DEBUG "inject vMCE to DOM%d"
+ " failed\n", d->domain_id);
+ domain_crash(d);
+ return;
+ }
+ /* Impacted domain goes on with the domain's recovery job
+ * if the domain has its own MCA handler.
+ * For xen, it has contained the error and finished
+ * its own recovery job.
+ */
result->result = MCA_RECOVERED;
}
}
@@ -309,7 +398,7 @@ struct mca_error_handler intel_recovery_
* should be committed for dom0 consumption, 0 if it should be
* dismissed.
*/
-static int mce_action(unsigned int cpu, mctelem_cookie_t mctc)
+static int mce_action(mctelem_cookie_t mctc)
{
struct mc_info *local_mi;
uint32_t i;
@@ -335,9 +424,6 @@ static int mce_action(unsigned int cpu,
continue;
}
mc_bank = (struct mcinfo_bank*)mic;
- /* Fill vMCE# injection and vMCE# MSR virtualization related data */
- if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1)
- break;
/* TODO: Add recovery actions here, such as page-offline, etc */
memset(&mca_res, 0x0f, sizeof(mca_res));
@@ -386,7 +472,6 @@ static void mce_softirq(void)
{
int cpu = smp_processor_id();
unsigned int workcpu;
- cpumask_t affinity;
printk(KERN_DEBUG "CPU%d enter softirq\n", cpu);
@@ -417,27 +502,13 @@ static void mce_softirq(void)
* vMCE MSRs virtualization buffer
*/
for_each_online_cpu(workcpu) {
- mctelem_process_deferred(workcpu, mce_action);
+ mctelem_process_deferred(workcpu, mce_action);
}
/* Step2: Send Log to DOM0 through vIRQ */
if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
printk(KERN_DEBUG "MCE: send MCE# to DOM0 through virq\n");
send_guest_global_virq(dom0, VIRQ_MCA);
- }
-
- /* Step3: Inject vMCE to impacted DOM. Currently we cares DOM0 only */
- if (guest_has_trap_callback
- (dom0, 0, TRAP_machine_check) &&
- !test_and_set_bool(dom0->vcpu[0]->mce_pending)) {
- dom0->vcpu[0]->cpu_affinity_tmp =
- dom0->vcpu[0]->cpu_affinity;
- cpus_clear(affinity);
- cpu_set(cpu, affinity);
- printk(KERN_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu,
- dom0->vcpu[0]->processor);
- vcpu_set_affinity(dom0->vcpu[0], &affinity);
- vcpu_kick(dom0->vcpu[0]);
}
}
@@ -1057,7 +1128,27 @@ int intel_mce_wrmsr(u32 msr, u64 value)
break;
case MSR_IA32_MCG_STATUS:
d->arch.vmca_msrs.mcg_status = value;
- gdprintk(XENLOG_DEBUG, "MCE: wrmsr MCG_CTL %"PRIx64"\n", value);
+ gdprintk(XENLOG_DEBUG, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", value);
+ /* For HVM guest, this is the point for deleting vMCE injection node */
+ if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection >0) )
+ {
+ d->arch.vmca_msrs.nr_injection--; /* Should be 0 */
+ if (!list_empty(&d->arch.vmca_msrs.impact_header)) {
+ entry = list_entry(d->arch.vmca_msrs.impact_header.next,
+ struct bank_entry, list);
+ if (entry->mci_status & MCi_STATUS_VAL)
+ gdprintk(XENLOG_ERR, "MCE: MCi_STATUS MSR should have "
+ "been cleared before write MCG_STATUS MSR\n");
+
+ gdprintk(XENLOG_DEBUG, "MCE: Delete HVM last injection "
+ "Node, nr_injection %u\n",
+ d->arch.vmca_msrs.nr_injection);
+ list_del(&entry->list);
+ }
+ else
+ gdprintk(XENLOG_DEBUG, "MCE: Not found HVM guest"
+ " last injection Node, something Wrong!\n");
+ }
break;
case MSR_IA32_MCG_CAP:
gdprintk(XENLOG_WARNING, "MCE: MCG_CAP is read-only\n");
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/cpu/mcheck/mctelem.c
--- a/xen/arch/x86/cpu/mcheck/mctelem.c Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mctelem.c Tue Jun 30 15:40:39 2009 +0100
@@ -153,7 +153,7 @@ void mctelem_defer(mctelem_cookie_t cook
}
void mctelem_process_deferred(unsigned int cpu,
- int (*fn)(unsigned int, mctelem_cookie_t))
+ int (*fn)(mctelem_cookie_t))
{
struct mctelem_ent *tep;
struct mctelem_ent *head, *prev;
@@ -189,7 +189,7 @@ void mctelem_process_deferred(unsigned i
prev = tep->mcte_prev;
tep->mcte_next = tep->mcte_prev = NULL;
- ret = fn(cpu, MCTE2COOKIE(tep));
+ ret = fn(MCTE2COOKIE(tep));
if (prev != NULL)
prev->mcte_next = NULL;
tep->mcte_prev = tep->mcte_next = NULL;
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/cpu/mcheck/mctelem.h
--- a/xen/arch/x86/cpu/mcheck/mctelem.h Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mctelem.h Tue Jun 30 15:40:39 2009 +0100
@@ -69,7 +69,7 @@ extern void mctelem_ack(mctelem_class_t,
extern void mctelem_ack(mctelem_class_t, mctelem_cookie_t);
extern void mctelem_defer(mctelem_cookie_t);
extern void mctelem_process_deferred(unsigned int,
- int (*)(unsigned int, mctelem_cookie_t));
+ int (*)(mctelem_cookie_t));
int mctelem_has_deferred(unsigned int);
#endif
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/hvm/hvm.c Tue Jun 30 15:40:39 2009 +0100
@@ -1771,6 +1771,8 @@ void hvm_rdtsc_intercept(struct cpu_user
regs->edx = (uint32_t)(tsc >> 32);
}
+extern int intel_mce_rdmsr(u32 msr, u32 *lo, u32 *hi);
+extern int intel_mce_wrmsr(u32 msr, u64 value);
int hvm_msr_read_intercept(struct cpu_user_regs *regs)
{
uint32_t ecx = regs->ecx;
@@ -1779,6 +1781,8 @@ int hvm_msr_read_intercept(struct cpu_us
uint64_t *var_range_base, *fixed_range_base;
int index, mtrr;
uint32_t cpuid[4];
+ uint32_t lo, hi;
+ int ret;
var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges;
fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges;
@@ -1794,18 +1798,6 @@ int hvm_msr_read_intercept(struct cpu_us
case MSR_IA32_APICBASE:
msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
- break;
-
- case MSR_IA32_MCG_CAP:
- case MSR_IA32_MCG_STATUS:
- case MSR_IA32_MC0_STATUS:
- case MSR_IA32_MC1_STATUS:
- case MSR_IA32_MC2_STATUS:
- case MSR_IA32_MC3_STATUS:
- case MSR_IA32_MC4_STATUS:
- case MSR_IA32_MC5_STATUS:
- /* No point in letting the guest see real MCEs */
- msr_content = 0;
break;
case MSR_IA32_CR_PAT:
@@ -1858,7 +1850,17 @@ int hvm_msr_read_intercept(struct cpu_us
break;
default:
- return hvm_funcs.msr_read_intercept(regs);
+ ret = intel_mce_rdmsr(ecx, &lo, &hi);
+ if ( ret < 0 )
+ goto gp_fault;
+ else if ( ret )
+ {
+ msr_content = ((u64)hi << 32) | lo;
+ break;
+ }
+ /* ret == 0, This is not an MCE MSR, see other MSRs */
+ else if (!ret)
+ return hvm_funcs.msr_read_intercept(regs);
}
regs->eax = (uint32_t)msr_content;
@@ -1884,6 +1886,7 @@ int hvm_msr_write_intercept(struct cpu_u
struct vcpu *v = current;
int index, mtrr;
uint32_t cpuid[4];
+ int ret;
hvm_cpuid(1, &cpuid[0], &cpuid[1], &cpuid[2], &cpuid[3]);
mtrr = !!(cpuid[3] & bitmaskof(X86_FEATURE_MTRR));
@@ -1946,7 +1949,13 @@ int hvm_msr_write_intercept(struct cpu_u
break;
default:
- return hvm_funcs.msr_write_intercept(regs);
+ ret = intel_mce_wrmsr(ecx, msr_content);
+ if ( ret < 0 )
+ goto gp_fault;
+ else if ( ret )
+ break;
+ else if (!ret)
+ return hvm_funcs.msr_write_intercept(regs);
}
return X86EMUL_OKAY;
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/hvm/irq.c
--- a/xen/arch/x86/hvm/irq.c Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/hvm/irq.c Tue Jun 30 15:40:39 2009 +0100
@@ -326,6 +326,9 @@ struct hvm_intack hvm_vcpu_has_pending_i
if ( unlikely(v->nmi_pending) )
return hvm_intack_nmi;
+ if ( unlikely(v->mce_pending) )
+ return hvm_intack_mce;
+
if ( vlapic_accept_pic_intr(v) && plat->vpic[0].int_output )
return hvm_intack_pic(0);
@@ -345,6 +348,10 @@ struct hvm_intack hvm_vcpu_ack_pending_i
{
case hvm_intsrc_nmi:
if ( !test_and_clear_bool(v->nmi_pending) )
+ intack = hvm_intack_none;
+ break;
+ case hvm_intsrc_mce:
+ if ( !test_and_clear_bool(v->mce_pending) )
intack = hvm_intack_none;
break;
case hvm_intsrc_pic:
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/hvm/vmx/intr.c
--- a/xen/arch/x86/hvm/vmx/intr.c Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/hvm/vmx/intr.c Tue Jun 30 15:40:39 2009 +0100
@@ -157,6 +157,10 @@ asmlinkage void vmx_intr_assist(void)
{
vmx_inject_nmi();
}
+ else if ( intack.source == hvm_intsrc_mce )
+ {
+ vmx_inject_hw_exception(TRAP_machine_check, HVM_DELIVER_NO_ERROR_CODE);
+ }
else
{
HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
diff -r 00502df38143 -r 7bbbc57163d5 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/arch/x86/x86_64/traps.c Tue Jun 30 15:40:39 2009 +0100
@@ -309,12 +309,13 @@ unsigned long do_iret(void)
&& !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity))
vcpu_set_affinity(v, &v->cpu_affinity_tmp);
- /*Currently, only inject vMCE to DOM0.*/
+ /* inject vMCE to PV_Guest including DOM0. */
if (v->trap_priority >= VCPU_TRAP_NMI) {
- printk(KERN_DEBUG "MCE: Return from vMCE# trap!");
- if (d->domain_id == 0 && v->vcpu_id == 0) {
+ printk(KERN_DEBUG "MCE: Return from vMCE# trap!\n");
+ if ( v->vcpu_id == 0 ) {
if ( !d->arch.vmca_msrs.nr_injection ) {
- printk(KERN_WARNING "MCE: Ret from vMCE#, nr_injection is
0\n");
+ printk(KERN_WARNING "MCE: Ret from vMCE#, "
+ "No injection Node\n");
goto end;
}
diff -r 00502df38143 -r 7bbbc57163d5 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/include/asm-x86/domain.h Tue Jun 30 15:40:39 2009 +0100
@@ -210,7 +210,6 @@ struct p2m_domain;
* put into impact_header list. */
struct bank_entry {
struct list_head list;
- int32_t cpu;
uint16_t bank;
uint64_t mci_status;
uint64_t mci_addr;
diff -r 00502df38143 -r 7bbbc57163d5 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h Tue Jun 30 15:37:14 2009 +0100
+++ b/xen/include/asm-x86/hvm/hvm.h Tue Jun 30 15:40:39 2009 +0100
@@ -31,7 +31,8 @@ enum hvm_intsrc {
hvm_intsrc_none,
hvm_intsrc_pic,
hvm_intsrc_lapic,
- hvm_intsrc_nmi
+ hvm_intsrc_nmi,
+ hvm_intsrc_mce
};
struct hvm_intack {
uint8_t source; /* enum hvm_intsrc */
@@ -41,6 +42,7 @@ struct hvm_intack {
#define hvm_intack_pic(vec) ( (struct hvm_intack) { hvm_intsrc_pic, vec } )
#define hvm_intack_lapic(vec) ( (struct hvm_intack) { hvm_intsrc_lapic, vec } )
#define hvm_intack_nmi ( (struct hvm_intack) { hvm_intsrc_nmi, 2 } )
+#define hvm_intack_mce ( (struct hvm_intack) { hvm_intsrc_mce, 18 } )
enum hvm_intblk {
hvm_intblk_none, /* not blocked (deliverable) */
hvm_intblk_shadow, /* MOV-SS or STI shadow */
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|