[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Xen-devel] [PATCH v4 14/28] x86/vvtd: Handle interrupt translation faults



On Fri, Nov 17, 2017 at 02:22:21PM +0800, Chao Gao wrote:
> Interrupt translation faults are non-recoverable faults. When faults
> are triggered, it needs to populate fault info to Fault Recording
> Registers and inject msi interrupt to notify guest IOMMU driver
> to deal with faults.
> 
> This patch emulates hardware's handling interrupt translation
> faults (more information about the process can be found in VT-d spec,
> chapter "Translation Faults", section "Non-Recoverable Fault
> Reporting" and section "Non-Recoverable Logging").
> Specifically, viommu_record_fault() records the fault information and
> viommu_report_non_recoverable_fault() reports faults to software.
> Currently, only Primary Fault Logging is supported and the Number of
> Fault-recording Registers is 1.
> 
> Signed-off-by: Chao Gao <chao.gao@xxxxxxxxx>
> Signed-off-by: Lan Tianyu <tianyu.lan@xxxxxxxxx>
> 
> ---
> v4:
>  - introduce a lock to protect fault-event related regs
> ---
>  xen/drivers/passthrough/vtd/iommu.h |  51 ++++++-
>  xen/drivers/passthrough/vtd/vvtd.c  | 288 
> +++++++++++++++++++++++++++++++++++-
>  2 files changed, 333 insertions(+), 6 deletions(-)
> 
> diff --git a/xen/drivers/passthrough/vtd/iommu.h 
> b/xen/drivers/passthrough/vtd/iommu.h
> index 82edd2a..dc2df75 100644
> --- a/xen/drivers/passthrough/vtd/iommu.h
> +++ b/xen/drivers/passthrough/vtd/iommu.h
> @@ -196,26 +196,67 @@
>  #define DMA_CCMD_CAIG_MASK(x) (((u64)x) & ((u64) 0x3 << 59))
>  
>  /* FECTL_REG */
> -#define DMA_FECTL_IM        ((uint32_t)1 << 31)
> +#define DMA_FECTL_IM_SHIFT  31
> +#define DMA_FECTL_IP_SHIFT  30
> +#define DMA_FECTL_IM        ((uint32_t)1 << DMA_FECTL_IM_SHIFT)
> +#define DMA_FECTL_IP        ((uint32_t)1 << DMA_FECTL_IP_SHIFT)
>  
>  /* FSTS_REG */
> -#define DMA_FSTS_PFO        ((uint32_t)1 << 0)
> -#define DMA_FSTS_PPF        ((uint32_t)1 << 1)
> +#define DMA_FSTS_PFO_SHIFT  0
> +#define DMA_FSTS_PPF_SHIFT  1
> +#define DMA_FSTS_PRO_SHIFT  7
> +
> +#define DMA_FSTS_PFO        ((uint32_t)1 << DMA_FSTS_PFO_SHIFT)
> +#define DMA_FSTS_PPF        ((uint32_t)1 << DMA_FSTS_PPF_SHIFT)
>  #define DMA_FSTS_AFO        ((uint32_t)1 << 2)
>  #define DMA_FSTS_APF        ((uint32_t)1 << 3)
>  #define DMA_FSTS_IQE        ((uint32_t)1 << 4)
>  #define DMA_FSTS_ICE        ((uint32_t)1 << 5)
>  #define DMA_FSTS_ITE        ((uint32_t)1 << 6)
> -#define DMA_FSTS_FAULTS    DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | 
> DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | DMA_FSTS_ITE
> +#define DMA_FSTS_PRO        ((uint32_t)1 << DMA_FSTS_PRO_SHIFT)
> +#define DMA_FSTS_FAULTS     (DMA_FSTS_PFO | DMA_FSTS_PPF | DMA_FSTS_AFO | \
> +                             DMA_FSTS_APF | DMA_FSTS_IQE | DMA_FSTS_ICE | \
> +                             DMA_FSTS_ITE | DMA_FSTS_PRO)
> +#define DMA_FSTS_RW1CS      (DMA_FSTS_PFO | DMA_FSTS_AFO | DMA_FSTS_APF | \
> +                             DMA_FSTS_IQE | DMA_FSTS_ICE | DMA_FSTS_ITE | \
> +                             DMA_FSTS_PRO)
>  #define dma_fsts_fault_record_index(s) (((s) >> 8) & 0xff)
>  
>  /* FRCD_REG, 32 bits access */
> -#define DMA_FRCD_F (((u64)1) << 31)
> +#define DMA_FRCD_LEN            0x10
> +#define DMA_FRCD2_OFFSET        0x8
> +#define DMA_FRCD3_OFFSET        0xc
> +#define DMA_FRCD_F_SHIFT        31
> +#define DMA_FRCD_F ((u64)1 << DMA_FRCD_F_SHIFT)
>  #define dma_frcd_type(d) ((d >> 30) & 1)
>  #define dma_frcd_fault_reason(c) (c & 0xff)
>  #define dma_frcd_source_id(c) (c & 0xffff)
>  #define dma_frcd_page_addr(d) (d & (((u64)-1) << 12)) /* low 64 bit */
>  
> +struct vtd_fault_record_register
> +{
> +    union {
> +        struct {
> +            uint64_t lo;
> +            uint64_t hi;
> +        } bits;
> +        struct {
> +            uint64_t rsvd0          :12,
> +                     fault_info     :52;
> +            uint64_t source_id      :16,
> +                     rsvd1          :9,
> +                     pmr            :1,  /* Privilege Mode Requested */
> +                     exe            :1,  /* Execute Permission Requested */
> +                     pasid_p        :1,  /* PASID Present */
> +                     fault_reason   :8,  /* Fault Reason */
> +                     pasid_val      :20, /* PASID Value */
> +                     addr_type      :2,  /* Address Type */
> +                     type           :1,  /* Type. (0) Write (1) 
> Read/AtomicOp */
> +                     fault          :1;  /* Fault */
> +        } fields;
> +    };
> +};
> +
>  /* Interrupt remapping transition faults */
>  #define VTD_FR_IR_REQ_RSVD      0x20
>  #define VTD_FR_IR_INDEX_OVER    0x21
> diff --git a/xen/drivers/passthrough/vtd/vvtd.c 
> b/xen/drivers/passthrough/vtd/vvtd.c
> index d3dec01..83805d1 100644
> --- a/xen/drivers/passthrough/vtd/vvtd.c
> +++ b/xen/drivers/passthrough/vtd/vvtd.c
> @@ -43,6 +43,7 @@
>  struct hvm_hw_vvtd {
>      bool eim_enabled;
>      bool intremap_enabled;
> +    uint32_t fault_index;
>  
>      /* Interrupt remapping table base gfn and the max of entries */
>      uint16_t irt_max_entry;
> @@ -58,6 +59,12 @@ struct vvtd {
>      struct domain *domain;
>      /* # of in-flight interrupts */
>      atomic_t inflight_intr;
> +    /*
> +     * This lock protects fault-event related registers (DMAR_FEXXX_REG).
> +     * It's used for draining in-flight fault events before responding
> +     * guest's programming to those registers.
> +     */
> +    spinlock_t fe_lock;

I still think almost if not all of the vvtd helper functions should be
mutually exclusive (ie: locked), not only the fault-event related
registers. I guess Linux or other OSes already serialize access to the
vIOMMU somehow, so you're not seeing any errors. But I'm quite sure
things will fail in weird ways if a malicious guests starts to
concurrently write to different vIOMMU registers.

>  
>      struct hvm_hw_vvtd hw;
>      void *irt_base;
> @@ -87,6 +94,21 @@ boolean_runtime_param("viommu_verbose", viommu_verbose);
>  #endif
>  
>  #define VVTD_REG_POS(vvtd, offset) &(vvtd->hw.regs[offset/sizeof(uint32_t)])
> +static inline int vvtd_test_and_set_bit(struct vvtd *vvtd, uint32_t reg, int 
> nr)
> +{
> +    return test_and_set_bit(nr, VVTD_REG_POS(vvtd, reg));
> +}
> +
> +static inline int vvtd_test_and_clear_bit(struct vvtd *vvtd, uint32_t reg,
> +                                          int nr)
> +{
> +    return test_and_clear_bit(nr, VVTD_REG_POS(vvtd, reg));
> +}

So for set and clear bit you use the non locked variants (prefixed by
__), and here you use the locked variants of test and set/clear. Is
there any reason for this? I would expect locked/unlocked bitops to be
used consistently for dealing with the registers unless there's a
specific reason not to do so.

> +
> +static inline int vvtd_test_bit(struct vvtd *vvtd, uint32_t reg, int nr)
> +{
> +    return test_bit(nr, VVTD_REG_POS(vvtd, reg));
> +}
>  
>  static inline void vvtd_set_bit(struct vvtd *vvtd, uint32_t reg, int nr)
>  {
> @@ -238,6 +260,30 @@ static int vvtd_delivery(struct domain *d, uint8_t 
> vector,
>      return 0;
>  }
>  
> +static void vvtd_generate_interrupt(const struct vvtd *vvtd, uint64_t addr,
> +                                    uint32_t data)
> +{
> +    bool dm = addr & MSI_ADDR_DESTMODE_MASK;

Please use MASK_EXTR here. Also destmode is usually treated as an
uint8_t in the rest of the Xen code (see vmsi_deliver). I would
probably keep using uint8_t just for consistency with the rest of the
code.

> +    uint32_t dest = MASK_EXTR(addr, MSI_ADDR_DEST_ID_MASK);
> +    uint8_t dlm = MASK_EXTR(data, MSI_DATA_DELIVERY_MODE_MASK);
> +    uint8_t tm = MASK_EXTR(data, MSI_DATA_TRIGGER_MASK);
> +    uint8_t vector = data & MSI_DATA_VECTOR_MASK;

MASK_EXTR please.

> +
> +    vvtd_debug("d%d: generating msi %lx %x\n", vvtd->domain->domain_id, addr,
> +               data);
> +
> +    if ( vvtd->hw.eim_enabled )
> +        dest |= (addr >> 40) << 8;

This 40 and 8 look like magic numbers to me, but it's likely me
missing something. Any reason not to use addr >> 32 directly? In any
case I would really appreciate if you could add defines for those
and/or comments.

> +
> +    vvtd_delivery(vvtd->domain, vector, dest, dm, dlm, tm);
> +}
> +
> +static void vvtd_notify_fault(const struct vvtd *vvtd)
> +{
> +    vvtd_generate_interrupt(vvtd, vvtd_get_reg_quad(vvtd, DMAR_FEADDR_REG),
> +                            vvtd_get_reg(vvtd, DMAR_FEDATA_REG));
> +}
> +
>  /* Computing the IRTE index for a given interrupt request. When success, 
> return
>   * 0 and set index to reference the corresponding IRTE. Otherwise, return < 
> 0,
>   * i.e. -1 when the irq request isn't an remapping format.
> @@ -290,6 +336,198 @@ static inline uint32_t irte_dest(struct vvtd *vvtd, 
> uint32_t dest)
>                                  : MASK_EXTR(dest, IRTE_xAPIC_DEST_MASK);
>  }
>  
> +static void vvtd_report_non_recoverable_fault(struct vvtd *vvtd, int reason)
> +{
> +    uint32_t fsts = vvtd_get_reg(vvtd, DMAR_FSTS_REG);
> +
> +    vvtd_set_bit(vvtd, DMAR_FSTS_REG, reason);

test_and_set?

> +
> +    /*
> +     * According to VT-d spec "Non-Recoverable Fault Event" chapter, if
> +     * there are any previously reported interrupt conditions that are yet to
> +     * be serviced by software, the Fault Event interrupt is not generated.
> +     */
> +    if ( fsts & DMA_FSTS_FAULTS )
> +        return;
> +
> +    vvtd_set_bit(vvtd, DMAR_FECTL_REG, DMA_FECTL_IP_SHIFT);
> +    if ( !vvtd_test_bit(vvtd, DMAR_FECTL_REG, DMA_FECTL_IM_SHIFT) )
> +    {
> +        vvtd_notify_fault(vvtd);
> +        vvtd_clear_bit(vvtd, DMAR_FECTL_REG, DMA_FECTL_IP_SHIFT);
> +    }
> +}
> +
> +static void vvtd_update_ppf(struct vvtd *vvtd)
> +{
> +    int i;

unsigned int.

> +    uint64_t cap = vvtd_get_reg_quad(vvtd, DMAR_CAP_REG);
> +    unsigned int base = cap_fault_reg_offset(cap);
> +
> +    for ( i = 0; i < cap_num_fault_regs(cap); i++ )
> +    {
> +        if ( vvtd_test_bit(vvtd, base + i * DMA_FRCD_LEN + DMA_FRCD3_OFFSET,
> +                           DMA_FRCD_F_SHIFT) )
> +        {
> +            vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_PPF_SHIFT);
> +            return;
> +        }
> +    }
> +    /*
> +     * No Primary Fault is in Fault Record Registers, thus clear PPF bit in
> +     * FSTS.
> +     */
> +    vvtd_clear_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_PPF_SHIFT);
> +
> +    /* If no fault is in FSTS, clear pending bit in FECTL. */
> +    if ( !(vvtd_get_reg(vvtd, DMAR_FSTS_REG) & DMA_FSTS_FAULTS) )
> +        vvtd_clear_bit(vvtd, DMAR_FECTL_REG, DMA_FECTL_IP_SHIFT);
> +}
> +
> +/*
> + * Commit a fault to emulated Fault Record Registers.
> + */
> +static void vvtd_commit_frcd(struct vvtd *vvtd, int idx,
> +                             const struct vtd_fault_record_register *frcd)
> +{
> +    unsigned int base = cap_fault_reg_offset(
> +                            vvtd_get_reg_quad(vvtd, DMAR_CAP_REG));
> +
> +    vvtd_set_reg_quad(vvtd, base + idx * DMA_FRCD_LEN, frcd->bits.lo);
> +    vvtd_set_reg_quad(vvtd, base + idx * DMA_FRCD_LEN + 8, frcd->bits.hi);
> +    vvtd_update_ppf(vvtd);
> +}
> +
> +/*
> + * Allocate a FRCD for the caller. If success, return the FRI. Or, return -1
> + * when failure.
> + */
> +static int vvtd_alloc_frcd(struct vvtd *vvtd)

What's the maximum value of FRCD according to the spec? Will it fit in
an int?

> +{
> +    int prev;
> +    uint64_t cap = vvtd_get_reg_quad(vvtd, DMAR_CAP_REG);
> +    unsigned int base = cap_fault_reg_offset(cap);
> +
> +    /* Set the F bit to indicate the FRCD is in use. */
> +    if ( !vvtd_test_and_set_bit(vvtd,
> +                                base + vvtd->hw.fault_index * DMA_FRCD_LEN +
> +                                DMA_FRCD3_OFFSET, DMA_FRCD_F_SHIFT) )
> +    {
> +        prev = vvtd->hw.fault_index;

prev can be declared inside the if:

    unsigned int prev = vvtd->hw.fault_index;

Also prev is used only once, so I think you can just get rid of it.

> +        vvtd->hw.fault_index = (prev + 1) % cap_num_fault_regs(cap);
> +        return vvtd->hw.fault_index;
> +    }

Newline.

> +    return -ENOMEM;
> +}
> +
> +static void vvtd_free_frcd(struct vvtd *vvtd, int i)
> +{
> +    unsigned int base = cap_fault_reg_offset(
> +                            vvtd_get_reg_quad(vvtd, DMAR_CAP_REG));
> +
> +    vvtd_clear_bit(vvtd, base + i * DMA_FRCD_LEN + DMA_FRCD3_OFFSET,
> +                   DMA_FRCD_F_SHIFT);
> +}
> +
> +static int vvtd_record_fault(struct vvtd *vvtd,
> +                             const struct arch_irq_remapping_request 
> *request,
> +                             int reason)
> +{
> +    struct vtd_fault_record_register frcd;
> +    int fault_index;

unsigned int maybe, see comments above.

> +    uint32_t irt_index;
> +
> +    spin_lock(&vvtd->fe_lock);
> +    switch(reason)
> +    {
> +    case VTD_FR_IR_REQ_RSVD:
> +    case VTD_FR_IR_INDEX_OVER:
> +    case VTD_FR_IR_ENTRY_P:
> +    case VTD_FR_IR_ROOT_INVAL:
> +    case VTD_FR_IR_IRTE_RSVD:
> +    case VTD_FR_IR_REQ_COMPAT:
> +    case VTD_FR_IR_SID_ERR:
> +        if ( vvtd_test_bit(vvtd, DMAR_FSTS_REG, DMA_FSTS_PFO_SHIFT) )
> +            goto out;
> +
> +        /* No available Fault Record means Fault overflowed */
> +        fault_index = vvtd_alloc_frcd(vvtd);
> +        if ( fault_index < 0 )
> +        {
> +            vvtd_report_non_recoverable_fault(vvtd, DMA_FSTS_PFO_SHIFT);
> +            goto out;
> +        }
> +        memset(&frcd, 0, sizeof(frcd));

Given the fact that frcd has not padding you can initialize it at
declaration using:

struct vtd_fault_record_register frcd = { };

> +        frcd.fields.fault_reason = reason;
> +        if ( irq_remapping_request_index(request, &irt_index) )
> +            goto out;
> +        frcd.fields.fault_info = irt_index;
> +        frcd.fields.source_id = request->source_id;
> +        frcd.fields.fault = 1;
> +        vvtd_commit_frcd(vvtd, fault_index, &frcd);
> +        break;
> +
> +    default:
> +        vvtd_debug("d%d: can't handle vvtd fault (reason 0x%x)",
> +                   vvtd->domain->domain_id, reason);
> +        break;
> +    }
> +
> + out:
> +    spin_unlock(&vvtd->fe_lock);
> +    return X86EMUL_OKAY;

I'm not sure why this function needs to return any value given its
current usage, and in any case since it's not an emulation handler it
shouldn't use X86EMUL_* values at all.

> +}
> +
> +static int vvtd_write_frcd3(struct vvtd *vvtd, uint32_t val)
> +{
> +    /* Writing a 1 means clear fault */
> +    if ( val & DMA_FRCD_F )
> +    {
> +        vvtd_free_frcd(vvtd, 0);
> +        vvtd_update_ppf(vvtd);
> +    }
> +    return X86EMUL_OKAY;

Same here, I don't see the point in returning a value, and certainly
it shouldn't be X86EMUL_* in any case.

> +}
> +
> +static void vvtd_write_fectl(struct vvtd *vvtd, uint32_t val)
> +{
> +    /*
> +     * Only DMA_FECTL_IM bit is writable. Generate pending event when unmask.
> +     */
> +    if ( !(val & DMA_FECTL_IM) )
> +    {
> +        /* Clear IM */
> +        vvtd_clear_bit(vvtd, DMAR_FECTL_REG, DMA_FECTL_IM_SHIFT);
> +        if ( vvtd_test_and_clear_bit(vvtd, DMAR_FECTL_REG, 
> DMA_FECTL_IP_SHIFT) )
> +            vvtd_notify_fault(vvtd);
> +    }
> +    else
> +        vvtd_set_bit(vvtd, DMAR_FECTL_REG, DMA_FECTL_IM_SHIFT);
> +}
> +
> +static void vvtd_write_fsts(struct vvtd *vvtd, uint32_t val)
> +{
> +    int i, max_fault_index = DMA_FSTS_PRO_SHIFT;
> +    uint64_t bits_to_clear = val & DMA_FSTS_RW1CS;
> +
> +    if ( bits_to_clear )
> +    {
> +        i = find_first_bit(&bits_to_clear, max_fault_index / 8 + 1);
> +        while ( i <= max_fault_index )
> +        {
> +            vvtd_clear_bit(vvtd, DMAR_FSTS_REG, i);
> +            i = find_next_bit(&bits_to_clear, max_fault_index / 8 + 1, i + 
> 1);
> +        }
> +    }
> +
> +    /*
> +     * Clear IP field when all status fields in the Fault Status Register
> +     * being clear.
> +     */
> +    if ( !((vvtd_get_reg(vvtd, DMAR_FSTS_REG) & DMA_FSTS_FAULTS)) )
> +        vvtd_clear_bit(vvtd, DMAR_FECTL_REG, DMA_FECTL_IP_SHIFT);
> +}
> +
>  static void write_gcmd_ire(struct vvtd *vvtd, uint32_t val)
>  {
>      bool set = val & DMA_GCMD_IRE;
> @@ -391,11 +629,47 @@ static int vvtd_read(struct vcpu *v, unsigned long addr,
>      return X86EMUL_OKAY;
>  }
>  
> +static void vvtd_write_fault_regs(struct vvtd *vvtd, unsigned long val,
> +                                  unsigned int offset, unsigned int len)
> +{
> +    unsigned int fault_offset = cap_fault_reg_offset(
> +                                    vvtd_get_reg_quad(vvtd, DMAR_CAP_REG));
> +
> +    spin_lock(&vvtd->fe_lock);
> +    for ( ; len ; len -= 4, offset += 4, val = val >> 32)

It seems overkill to use a for loop here when len can only be 4 or 8
AFAICT (maybe I'm wrong). Is 64bit access really allowed to those
registers? You seem to treat all of them as 32bit registers which
makes me wonder if 64bit accesses are really allowed.

Thanks, Roger.

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.