[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH v18 14/16] x86/VPMU: NMI-based VPMU support



Add support for using NMIs as PMU interrupts to allow profiling the hypervisor
when interrupts are disabled.

Most of the processing is still performed by vpmu_do_interrupt(). However,
since certain operations are not NMI-safe, we defer them to a softirq that
vpmu_do_interrupt() will schedule:
* For PV guests that would be send_guest_vcpu_virq()
* For HVM guests it's VLAPIC accesses and hvm_get_segment_register() (the latter
  can be called in privileged profiling mode when the interrupted guest is an
  HVM one).

With send_guest_vcpu_virq() and hvm_get_segment_register() for PV(H) and vlapic
accesses for HVM moved to the softirq, the only routines/macros that
vpmu_do_interrupt() calls in NMI mode are:
* memcpy()
* querying domain type (is_XX_domain())
* guest_cpu_user_regs()
* XLAT_cpu_user_regs()
* raise_softirq()
* vcpu_vpmu()
* vpmu_ops->arch_vpmu_save()
* vpmu_ops->do_interrupt()

The latter two only access PMU MSRs with {rd,wr}msrl() (not the _safe versions
which would not be NMI-safe).

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
---
 docs/misc/xen-command-line.markdown |   8 +-
 xen/arch/x86/hvm/svm/vpmu.c         |   3 +-
 xen/arch/x86/hvm/vmx/vpmu_core2.c   |   3 +-
 xen/arch/x86/hvm/vpmu.c             | 229 ++++++++++++++++++++++++++++--------
 xen/include/asm-x86/hvm/vpmu.h      |   4 +-
 xen/include/asm-x86/softirq.h       |   3 +-
 6 files changed, 193 insertions(+), 57 deletions(-)

diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index bc316be..0ab1188 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -1330,11 +1330,11 @@ Use Virtual Processor ID support if available.  This prevents the need for TLB
 flushes on VM entry and exit, increasing performance.
 
 ### vpmu
-> `= ( bts )`
+> `= ( [nmi,][bts] )`
 
 > Default: `off`
 
-Switch on the virtualized performance monitoring unit for HVM guests.
+Switch on the virtualized performance monitoring unit.
 
 If the current cpu isn't supported a message like
 'VPMU: Initialization failed. ...'
@@ -1348,6 +1348,10 @@ feature is switched on on Intel processors supporting this feature.
 
 Note that if **watchdog** option is also specified vpmu will be turned off.
 
+If 'vpmu=nmi' is specified the PMU interrupt will cause an NMI instead of a
+regular vector interrupt (which is the default). This can be useful for sampling
+hypervisor code that is executed with interrupts disabled.
+
 *Warning:*
 As the BTS virtualisation is not 100% safe and because of the nehalem quirk
 don't use the vpmu flag on production systems with Intel cpus!
diff --git a/xen/arch/x86/hvm/svm/vpmu.c b/xen/arch/x86/hvm/svm/vpmu.c
index 68113c7..7ddce33 100644
--- a/xen/arch/x86/hvm/svm/vpmu.c
+++ b/xen/arch/x86/hvm/svm/vpmu.c
@@ -168,7 +168,7 @@ static void amd_vpmu_unset_msr_bitmap(struct vcpu *v)
     msr_bitmap_off(vpmu);
 }
 
-static int amd_vpmu_do_interrupt(struct cpu_user_regs *regs)
+static int amd_vpmu_do_interrupt(const struct cpu_user_regs *regs)
 {
     return 1;
 }
@@ -220,6 +220,7 @@ static inline void context_save(struct vpmu_struct *vpmu)
         rdmsrl(counters[i], counter_regs[i]);
 }
 
+/* Must be NMI-safe */
 static int amd_vpmu_save(struct vpmu_struct *vpmu)
 {
     struct vcpu *v;
diff --git a/xen/arch/x86/hvm/vmx/vpmu_core2.c b/xen/arch/x86/hvm/vmx/vpmu_core2.c
index 8067d83..0c7fd74 100644
--- a/xen/arch/x86/hvm/vmx/vpmu_core2.c
+++ b/xen/arch/x86/hvm/vmx/vpmu_core2.c
@@ -305,6 +305,7 @@ static inline void __core2_vpmu_save(struct vpmu_struct *vpmu)
         rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, core2_vpmu_cxt->global_status);
 }
 
+/* Must be NMI-safe */
 static int core2_vpmu_save(struct vpmu_struct *vpmu)
 {
     struct vcpu *v = vpmu_vcpu(vpmu);
@@ -720,7 +721,7 @@ static void core2_vpmu_dump(const struct vcpu *v)
     }
 }
 
-static int core2_vpmu_do_interrupt(struct cpu_user_regs *regs)
+static int core2_vpmu_do_interrupt(const struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     u64 msr_content;
diff --git a/xen/arch/x86/hvm/vpmu.c b/xen/arch/x86/hvm/vpmu.c
index 651cb00..cdead13 100644
--- a/xen/arch/x86/hvm/vpmu.c
+++ b/xen/arch/x86/hvm/vpmu.c
@@ -56,29 +56,47 @@ static bool_t __read_mostly vpmu_disabled = 1;
 static void parse_vpmu_param(char *s);
 custom_param("vpmu", parse_vpmu_param);
 
+static void pmu_softnmi(void);
+
 static DEFINE_PER_CPU(struct vcpu *, last_vcpu);
+static DEFINE_PER_CPU(struct vcpu *, sampled_vcpu);
+
+static uint32_t __read_mostly vpmu_interrupt_type = PMU_APIC_VECTOR;
 
 static void __init parse_vpmu_param(char *s)
 {
-    switch ( parse_bool(s) )
-    {
-    case 0:
-        break;
-    default:
-        if ( !strcmp(s, "bts") )
-            vpmu_features |= XENPMU_FEATURE_INTEL_BTS;
-        else if ( *s )
+    char *ss;
+
+    vpmu_mode = XENPMU_MODE_SELF;
+    vpmu_disabled = 0;
+    if (*s == '\0')
+        return;
+
+    do {
+        ss = strchr(s, ',');
+        if ( ss )
+            *ss = '\0';
+
+        switch  ( parse_bool(s) )
         {
-            printk("VPMU: unknown flag: %s - vpmu disabled!\n", s);
-            break;
+        default:
+            if ( !strcmp(s, "nmi") )
+                vpmu_interrupt_type = APIC_DM_NMI;
+            else if ( !strcmp(s, "bts") )
+                vpmu_features |= XENPMU_FEATURE_INTEL_BTS;
+            else
+            {
+                printk("VPMU: unknown flag: %s - vpmu disabled!\n", s);
+        case 0:
+                vpmu_mode = XENPMU_MODE_OFF;
+                vpmu_disabled = 1;
+        case 1:
+                return;
+            }
         }
-        /* fall through */
-    case 1:
-        /* Default VPMU mode */
-        vpmu_mode = XENPMU_MODE_SELF;
-        vpmu_disabled = 0;
-        break;
-    }
+
+        s = ss + 1;
+    } while ( ss );
 }
 
 void vpmu_lvtpc_update(uint32_t val)
@@ -92,7 +110,7 @@ void vpmu_lvtpc_update(uint32_t val)
     curr = current;
     vpmu = vcpu_vpmu(curr);
 
-    vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | (val & APIC_LVT_MASKED);
+    vpmu->hw_lapic_lvtpc = vpmu_interrupt_type | (val & APIC_LVT_MASKED);
 
     /* Postpone APIC updates for PV(H) guests if PMU interrupt is pending */
     if ( is_hvm_vcpu(curr) || !vpmu->xenpmu_data ||
@@ -100,6 +118,30 @@ void vpmu_lvtpc_update(uint32_t val)
         apic_write(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
 }
 
+static void vpmu_send_interrupt(struct vcpu *v)
+{
+    struct vlapic *vlapic;
+    u32 vlapic_lvtpc;
+
+    ASSERT(is_hvm_vcpu(v));
+
+    vlapic = vcpu_vlapic(v);
+    if ( !is_vlapic_lvtpc_enabled(vlapic) )
+        return;
+
+    vlapic_lvtpc = vlapic_get_reg(vlapic, APIC_LVTPC);
+
+    switch ( GET_APIC_DELIVERY_MODE(vlapic_lvtpc) )
+    {
+    case APIC_MODE_FIXED:
+        vlapic_set_irq(vlapic, vlapic_lvtpc & APIC_VECTOR_MASK, 0);
+        break;
+    case APIC_MODE_NMI:
+        v->nmi_pending = 1;
+        break;
+    }
+}
+
 int vpmu_do_msr(unsigned int msr, uint64_t *msr_content,
                 uint64_t supported, bool_t is_write)
 {
@@ -157,7 +199,7 @@ static struct vcpu *choose_hwdom_vcpu(void)
     return hardware_domain->vcpu[idx];
 }
 
-void vpmu_do_interrupt(struct cpu_user_regs *regs)
+int vpmu_do_interrupt(const struct cpu_user_regs *regs)
 {
     struct vcpu *sampled = current, *sampling;
     struct vpmu_struct *vpmu;
@@ -171,7 +213,7 @@ void vpmu_do_interrupt(struct cpu_user_regs *regs)
     {
         sampling = choose_hwdom_vcpu();
         if ( !sampling )
-            return;
+            return 0;
     }
     else
         sampling = sampled;
@@ -185,15 +227,15 @@ void vpmu_do_interrupt(struct cpu_user_regs *regs)
         uint32_t domid;
 
         if ( !vpmu->xenpmu_data )
-            return;
+            return 0;
 
         if ( is_pvh_vcpu(sampling) &&
              !(vpmu_mode & XENPMU_MODE_ALL) &&
              !vpmu->arch_vpmu_ops->do_interrupt(regs) )
-            return;
+            return 0;
 
         if ( *flags & PMU_CACHED )
-            return;
+            return 1;
 
         /* PV guest will be reading PMU MSRs from xenpmu_data */
         vpmu_set(vpmu, VPMU_CONTEXT_SAVE | VPMU_CONTEXT_LOADED);
@@ -260,15 +302,20 @@ void vpmu_do_interrupt(struct cpu_user_regs *regs)
             }
             else
             {
-                struct segment_register seg;
-
-                hvm_get_segment_register(sampled, x86_seg_cs, &seg);
-                r->cs = seg.sel;
-                hvm_get_segment_register(sampled, x86_seg_ss, &seg);
-                r->ss = seg.sel;
-                r->cpl = seg.attr.fields.dpl;
                 if ( !(sampled->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
                     *flags |= PMU_SAMPLE_REAL;
+
+                /* Unsafe in NMI context, defer to softint later. */
+                if ( vpmu_interrupt_type != APIC_DM_NMI )
+                {
+                    struct segment_register seg;
+
+                    hvm_get_segment_register(sampled, x86_seg_cs, &seg);
+                    r->cs = seg.sel;
+                    hvm_get_segment_register(sampled, x86_seg_ss, &seg);
+                    r->ss = seg.sel;
+                    r->cpl = seg.attr.fields.dpl;
+                }
             }
         }
 
@@ -280,35 +327,37 @@ void vpmu_do_interrupt(struct cpu_user_regs *regs)
         apic_write(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
         *flags |= PMU_CACHED;
 
-        send_guest_vcpu_virq(sampling, VIRQ_XENPMU);
+        if ( vpmu_interrupt_type == APIC_DM_NMI )
+        {
+            this_cpu(sampled_vcpu) = sampled;
+            raise_softirq(PMU_SOFTIRQ);
+        }
+        else
+            send_guest_vcpu_virq(sampling, VIRQ_XENPMU);
 
-        return;
+        return 1;
     }
 
     if ( vpmu->arch_vpmu_ops )
     {
-        struct vlapic *vlapic = vcpu_vlapic(sampling);
-        u32 vlapic_lvtpc;
-
         /* We don't support (yet) HVM dom0 */
         ASSERT(sampling == sampled);
 
-        if ( !vpmu->arch_vpmu_ops->do_interrupt(regs) ||
-             !is_vlapic_lvtpc_enabled(vlapic) )
-            return;
-
-        vlapic_lvtpc = vlapic_get_reg(vlapic, APIC_LVTPC);
+        if ( !vpmu->arch_vpmu_ops->do_interrupt(regs) )
+            return 0;
 
-        switch ( GET_APIC_DELIVERY_MODE(vlapic_lvtpc) )
+        if ( vpmu_interrupt_type == APIC_DM_NMI )
         {
-        case APIC_MODE_FIXED:
-            vlapic_set_irq(vlapic, vlapic_lvtpc & APIC_VECTOR_MASK, 0);
-            break;
-        case APIC_MODE_NMI:
-            sampling->nmi_pending = 1;
-            break;
+            this_cpu(sampled_vcpu) = sampled;
+            raise_softirq(PMU_SOFTIRQ);
         }
+        else
+            vpmu_send_interrupt(sampling);
+
+        return 1;
     }
+
+    return 0;
 }
 
 void vpmu_do_cpuid(unsigned int input,
@@ -336,6 +385,9 @@ static void vpmu_save_force(void *arg)
     vpmu_reset(vpmu, VPMU_CONTEXT_SAVE);
 
     per_cpu(last_vcpu, smp_processor_id()) = NULL;
+
+    /* Make sure there are no outstanding PMU NMIs */
+    pmu_softnmi();
 }
 
 void vpmu_save(struct vpmu_struct *vpmu)
@@ -352,7 +404,10 @@ void vpmu_save(struct vpmu_struct *vpmu)
         if ( vpmu->arch_vpmu_ops->arch_vpmu_save(vpmu) )
             vpmu_reset(vpmu, VPMU_CONTEXT_LOADED);
 
-    apic_write(APIC_LVTPC, PMU_APIC_VECTOR | APIC_LVT_MASKED);
+    apic_write(APIC_LVTPC, vpmu_interrupt_type | APIC_LVT_MASKED);
+
+    /* Make sure there are no outstanding PMU NMIs */
+    pmu_softnmi();
 }
 
 void vpmu_load(struct vpmu_struct *vpmu)
@@ -403,6 +458,9 @@ void vpmu_load(struct vpmu_struct *vpmu)
           (vpmu->xenpmu_data->pmu.pmu_flags & PMU_CACHED)) )
         return;
 
+    /* Make sure there are no outstanding PMU NMIs from previous vcpu */
+    pmu_softnmi();
+
     if ( vpmu->arch_vpmu_ops && vpmu->arch_vpmu_ops->arch_vpmu_load )
     {
         apic_write_around(APIC_LVTPC, vpmu->hw_lapic_lvtpc);
@@ -426,7 +484,7 @@ void vpmu_initialise(struct vcpu *v)
         vpmu_destroy(v);
     vpmu_clear(vpmu);
     vpmu->context = NULL;
-    vpmu->hw_lapic_lvtpc = PMU_APIC_VECTOR | APIC_LVT_MASKED;
+    vpmu->hw_lapic_lvtpc = vpmu_interrupt_type | APIC_LVT_MASKED;
 
     switch ( vendor )
     {
@@ -487,6 +545,55 @@ void vpmu_destroy(struct vcpu *v)
     }
 }
 
+/* Process the softirq set by PMU NMI handler */
+static void pmu_softnmi(void)
+{
+    unsigned int cpu = smp_processor_id();
+    struct vcpu *v, *sampled = per_cpu(sampled_vcpu, cpu);
+
+    if ( sampled == NULL )
+        return;
+
+    per_cpu(sampled_vcpu, cpu) = NULL;
+
+    if ( (vpmu_mode & XENPMU_MODE_ALL) ||
+         (sampled->domain->domain_id >= DOMID_FIRST_RESERVED) )
+    {
+            v = choose_hwdom_vcpu();
+            if ( !v )
+                return;
+    }
+    else
+    {
+        if ( is_hvm_vcpu(sampled) )
+        {
+            vpmu_send_interrupt(sampled);
+            return;
+        }
+        v = sampled;
+    }
+
+    if ( has_hvm_container_vcpu(sampled) )
+    {
+        struct segment_register seg;
+        struct xen_pmu_arch *pmu = &v->arch.vpmu.xenpmu_data->pmu;
+        struct xen_pmu_regs *r = &pmu->r.regs;
+
+        hvm_get_segment_register(sampled, x86_seg_cs, &seg);
+        r->cs = seg.sel;
+        hvm_get_segment_register(sampled, x86_seg_ss, &seg);
+        r->ss = seg.sel;
+        r->cpl = seg.attr.fields.dpl;
+    }
+
+    send_guest_vcpu_virq(v, VIRQ_XENPMU);
+}
+
+int pmu_nmi_interrupt(const struct cpu_user_regs *regs, int cpu)
+{
+    return vpmu_do_interrupt(regs);
+}
+
 static int pvpmu_init(struct domain *d, xen_pmu_params_t *params)
 {
     struct vcpu *v;
@@ -502,6 +609,7 @@ static int pvpmu_init(struct domain *d, xen_pmu_params_t *params)
          (d->vcpu[params->vcpu] == NULL) )
         return -EINVAL;
 
+    v = d->vcpu[params->vcpu];
     if ( v->arch.vpmu.xenpmu_data )
         return -EINVAL;
 
@@ -515,7 +623,6 @@ static int pvpmu_init(struct domain *d, xen_pmu_params_t *params)
         return -EINVAL;
     }
 
-    v = d->vcpu[params->vcpu];
     vpmu = vcpu_vpmu(v);
     spin_lock(&vpmu->vpmu_lock);
 
@@ -832,6 +939,21 @@ static int __init vpmu_init(void)
         return 0;
     }
 
+    if ( vpmu_interrupt_type == APIC_DM_NMI )
+    {
+        if ( reserve_lapic_nmi() != 0 )
+        {
+            printk(XENLOG_WARNING "VPMU: Can't reserve NMI, will use"
+                                  " APIC vector 0x%x\n", PMU_APIC_VECTOR);
+            vpmu_interrupt_type = PMU_APIC_VECTOR;
+        }
+        else
+        {
+            set_nmi_callback(pmu_nmi_interrupt);
+            open_softirq(PMU_SOFTIRQ, pmu_softnmi);
+        }
+    }
+
     switch ( vendor )
     {
     case X86_VENDOR_AMD:
@@ -853,7 +975,14 @@ static int __init vpmu_init(void)
         printk(XENLOG_INFO "VPMU: version " __stringify(XENPMU_VER_MAJ) "."
                __stringify(XENPMU_VER_MIN) "\n");
     else
+    {
+        if ( vpmu_interrupt_type == APIC_DM_NMI )
+        {
+            unset_nmi_callback();
+            release_lapic_nmi();
+        }
         vpmu_disabled = 1;
+    }
 
     return 0;
 }
diff --git a/xen/include/asm-x86/hvm/vpmu.h b/xen/include/asm-x86/hvm/vpmu.h
index 2c888cc..ed5dc8c 100644
--- a/xen/include/asm-x86/hvm/vpmu.h
+++ b/xen/include/asm-x86/hvm/vpmu.h
@@ -53,7 +53,7 @@ struct arch_vpmu_ops {
     int (*do_wrmsr)(unsigned int msr, uint64_t msr_content,
                     uint64_t supported);
     int (*do_rdmsr)(unsigned int msr, uint64_t *msr_content);
-    int (*do_interrupt)(struct cpu_user_regs *regs);
+    int (*do_interrupt)(const struct cpu_user_regs *regs);
     void (*do_cpuid)(unsigned int input,
                      unsigned int *eax, unsigned int *ebx,
                      unsigned int *ecx, unsigned int *edx);
@@ -102,7 +102,7 @@ static inline bool_t vpmu_are_all_set(const struct vpmu_struct *vpmu,
 void vpmu_lvtpc_update(uint32_t val);
 int vpmu_do_msr(unsigned int msr, uint64_t *msr_content,
                 uint64_t supported, bool_t is_write);
-void vpmu_do_interrupt(struct cpu_user_regs *regs);
+int vpmu_do_interrupt(const struct cpu_user_regs *regs);
 void vpmu_do_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
                                        unsigned int *ecx, unsigned int *edx);
 void vpmu_initialise(struct vcpu *v);
diff --git a/xen/include/asm-x86/softirq.h b/xen/include/asm-x86/softirq.h
index ec787d6..fca110f 100644
--- a/xen/include/asm-x86/softirq.h
+++ b/xen/include/asm-x86/softirq.h
@@ -8,7 +8,8 @@
 #define MACHINE_CHECK_SOFTIRQ  (NR_COMMON_SOFTIRQS + 3)
 #define PCI_SERR_SOFTIRQ       (NR_COMMON_SOFTIRQS + 4)
 #define HVM_DPCI_SOFTIRQ       (NR_COMMON_SOFTIRQS + 5)
-#define NR_ARCH_SOFTIRQS       6
+#define PMU_SOFTIRQ            (NR_COMMON_SOFTIRQS + 6)
+#define NR_ARCH_SOFTIRQS       7
 
 bool_t arch_skip_send_event_check(unsigned int cpu);
 
-- 
1.8.1.4


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.