
Re: [PATCH v2 2/8] xen/arm: revert atomic operation related command-queue insertion patch


  • To: Stefano Stabellini <sstabellini@xxxxxxxxxx>
  • From: Rahul Singh <Rahul.Singh@xxxxxxx>
  • Date: Wed, 2 Dec 2020 13:05:18 +0000
  • Accept-language: en-US
  • Cc: "xen-devel@xxxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxxx>, Bertrand Marquis <Bertrand.Marquis@xxxxxxx>, Julien Grall <julien@xxxxxxx>, Volodymyr Babchuk <Volodymyr_Babchuk@xxxxxxxx>
  • Delivery-date: Wed, 02 Dec 2020 13:05:36 +0000
  • List-id: Xen developer discussion <xen-devel.lists.xenproject.org>
  • Thread-topic: [PATCH v2 2/8] xen/arm: revert atomic operation related command-queue insertion patch

Hello Stefano,

Thanks for reviewing the code.

> On 1 Dec 2020, at 10:23 pm, Stefano Stabellini <sstabellini@xxxxxxxxxx> wrote:
> 
> On Thu, 26 Nov 2020, Rahul Singh wrote:
>> Linux SMMUv3 code implements the commands-queue insertion based on
>> atomic operations implemented in Linux. Atomic functions used by the
>> commands-queue insertion is not implemented in XEN therefore revert the
>> patch that implemented the commands-queue insertion based on atomic
>> operations.
>> 
>> Once the proper atomic operations will be available in XEN the driver
>> can be updated.
>> 
>> Reverted the commit 587e6c10a7ce89a5924fdbeff2ec524fbd6a124b
>> iommu/arm-smmu-v3: Reduce contention during command-queue insertion
> 
> I checked 587e6c10a7ce89a5924fdbeff2ec524fbd6a124b: this patch does more
> than just reverting 587e6c10a7ce89a5924fdbeff2ec524fbd6a124b. It looks
> like it is also reverting edd0351e7bc49555d8b5ad8438a65a7ca262c9f0 and
> some other commits.
> 
> Please can you provide a complete list of reverted commits? I would like
> to be able to do the reverts myself on the linux tree and see that the
> driver textually matches the one on the xen tree with this patch
> applied.
> 
> 

Yes, this patch also reverts the commits that are based on the code that
introduced the atomic operations (see the short sketch after the list below).
I will list all of the reverted commit IDs in the commit message of the next
version of the patch.

The commits reverted by this patch are as follows:

9e773aee8c3e1b3ba019c5c7f8435aaa836c6130  iommu/arm-smmu-v3: Batch ATC invalidation commands
edd0351e7bc49555d8b5ad8438a65a7ca262c9f0  iommu/arm-smmu-v3: Batch context descriptor invalidation
4ce8da453640147101bda418640394637c1a7cfc  iommu/arm-smmu-v3: Add command queue batching helpers
2af2e72b18b499fa36d3f7379fd010ff25d2a984  iommu/arm-smmu-v3: Defer TLB invalidation until ->iotlb_sync()
587e6c10a7ce89a5924fdbeff2ec524fbd6a124b  iommu/arm-smmu-v3: Reduce contention during command-queue insertion
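
For clarity, what the revert restores is the older insertion model: the
prod/cons indices become plain u32 values again, carrying a wrap bit just
above the index bits, and every command is written to the queue under a
single cmdq spinlock instead of the lock-free batching scheme. The sketch
below is only an illustration, not driver code: the Q_IDX/Q_WRP macros are
written out here from the usual SMMUv3 encoding (an assumption, they are not
part of this patch), a pthread mutex stands in for the Xen spinlock, and the
queue is shrunk to 16 entries so it can be compiled and run in userspace.

/* Toy model of the lock-serialised command queue restored by the revert.
 * Build with: gcc -std=c11 -pthread cmdq_sketch.c -o cmdq_sketch
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_N_SHIFT   4                                   /* toy queue: 16 entries */
#define Q_IDX(p)      ((p) & ((1u << MAX_N_SHIFT) - 1))   /* index bits            */
#define Q_WRP(p)      ((p) & (1u << MAX_N_SHIFT))         /* wrap bit              */

struct ll_queue {
	uint32_t prod;                          /* producer index + wrap bit (plain u32) */
	uint32_t cons;                          /* consumer index + wrap bit             */
};

struct cmdq {
	struct ll_queue llq;
	uint64_t ents[1u << MAX_N_SHIFT];       /* one dword per "command"               */
	pthread_mutex_t lock;                   /* stands in for the driver's spinlock_t */
};

/* Same shape as queue_full()/queue_empty() in the reverted code: full when
 * the indices match but the wrap bits differ, empty when both match. */
static bool queue_full(const struct ll_queue *q)
{
	return Q_IDX(q->prod) == Q_IDX(q->cons) && Q_WRP(q->prod) != Q_WRP(q->cons);
}

static bool queue_empty(const struct ll_queue *q)
{
	return Q_IDX(q->prod) == Q_IDX(q->cons) && Q_WRP(q->prod) == Q_WRP(q->cons);
}

static void queue_inc_prod(struct ll_queue *q)
{
	/* Adding 1 to (wrap | idx) lets the carry out of the index bits flip
	 * the wrap bit automatically when we pass the end of the queue. */
	uint32_t prod = (Q_WRP(q->prod) | Q_IDX(q->prod)) + 1;

	q->prod = Q_WRP(prod) | Q_IDX(prod);
}

/* Serialised insertion: everything happens under one lock, no atomics. */
static int cmdq_insert(struct cmdq *q, uint64_t cmd)
{
	int ret = 0;

	pthread_mutex_lock(&q->lock);
	if (queue_full(&q->llq)) {
		ret = -1;                       /* caller would poll cons instead */
	} else {
		q->ents[Q_IDX(q->llq.prod)] = cmd;
		queue_inc_prod(&q->llq);
	}
	pthread_mutex_unlock(&q->lock);
	return ret;
}

int main(void)
{
	struct cmdq q = { .lock = PTHREAD_MUTEX_INITIALIZER };
	int inserted = 0;

	while (cmdq_insert(&q, 0x46 /* CMD_SYNC opcode */) == 0)
		inserted++;

	printf("inserted %d commands, full=%d, empty=%d\n",
	       inserted, queue_full(&q.llq), queue_empty(&q.llq));
	return 0;
}

Running this inserts 16 commands and then reports the queue as full, which is
exactly the wrap-bit full/empty check that queue_full()/queue_empty() perform
in the reverted driver code quoted below.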


Regards,
Rahul

> 
>> Signed-off-by: Rahul Singh <rahul.singh@xxxxxxx>
>> ---
>> xen/drivers/passthrough/arm/smmu-v3.c | 847 ++++++--------------------
>> 1 file changed, 180 insertions(+), 667 deletions(-)
>> 
>> diff --git a/xen/drivers/passthrough/arm/smmu-v3.c 
>> b/xen/drivers/passthrough/arm/smmu-v3.c
>> index c192544e87..97eac61ea4 100644
>> --- a/xen/drivers/passthrough/arm/smmu-v3.c
>> +++ b/xen/drivers/passthrough/arm/smmu-v3.c
>> @@ -330,15 +330,6 @@
>> #define CMDQ_ERR_CERROR_ABT_IDX              2
>> #define CMDQ_ERR_CERROR_ATC_INV_IDX  3
>> 
>> -#define CMDQ_PROD_OWNED_FLAG                Q_OVERFLOW_FLAG
>> -
>> -/*
>> - * This is used to size the command queue and therefore must be at least
>> - * BITS_PER_LONG so that the valid_map works correctly (it relies on the
>> - * total number of queue entries being a multiple of BITS_PER_LONG).
>> - */
>> -#define CMDQ_BATCH_ENTRIES          BITS_PER_LONG
>> -
>> #define CMDQ_0_OP                    GENMASK_ULL(7, 0)
>> #define CMDQ_0_SSV                   (1UL << 11)
>> 
>> @@ -407,8 +398,9 @@
>> #define PRIQ_1_ADDR_MASK             GENMASK_ULL(63, 12)
>> 
>> /* High-level queue structures */
>> -#define ARM_SMMU_POLL_TIMEOUT_US    1000000 /* 1s! */
>> -#define ARM_SMMU_POLL_SPIN_COUNT    10
>> +#define ARM_SMMU_POLL_TIMEOUT_US    100
>> +#define ARM_SMMU_CMDQ_SYNC_TIMEOUT_US       1000000 /* 1s! */
>> +#define ARM_SMMU_CMDQ_SYNC_SPIN_COUNT       10
>> 
>> #define MSI_IOVA_BASE                        0x8000000
>> #define MSI_IOVA_LENGTH                      0x100000
>> @@ -513,24 +505,15 @@ struct arm_smmu_cmdq_ent {
>> 
>>              #define CMDQ_OP_CMD_SYNC        0x46
>>              struct {
>> +                    u32                     msidata;
>>                      u64                     msiaddr;
>>              } sync;
>>      };
>> };
>> 
>> struct arm_smmu_ll_queue {
>> -    union {
>> -            u64                     val;
>> -            struct {
>> -                    u32             prod;
>> -                    u32             cons;
>> -            };
>> -            struct {
>> -                    atomic_t        prod;
>> -                    atomic_t        cons;
>> -            } atomic;
>> -            u8                      __pad[SMP_CACHE_BYTES];
>> -    } ____cacheline_aligned_in_smp;
>> +    u32                             prod;
>> +    u32                             cons;
>>      u32                             max_n_shift;
>> };
>> 
>> @@ -548,23 +531,9 @@ struct arm_smmu_queue {
>>      u32 __iomem                     *cons_reg;
>> };
>> 
>> -struct arm_smmu_queue_poll {
>> -    ktime_t                         timeout;
>> -    unsigned int                    delay;
>> -    unsigned int                    spin_cnt;
>> -    bool                            wfe;
>> -};
>> -
>> struct arm_smmu_cmdq {
>>      struct arm_smmu_queue           q;
>> -    atomic_long_t                   *valid_map;
>> -    atomic_t                        owner_prod;
>> -    atomic_t                        lock;
>> -};
>> -
>> -struct arm_smmu_cmdq_batch {
>> -    u64                             cmds[CMDQ_BATCH_ENTRIES * 
>> CMDQ_ENT_DWORDS];
>> -    int                             num;
>> +    spinlock_t                      lock;
>> };
>> 
>> struct arm_smmu_evtq {
>> @@ -660,6 +629,8 @@ struct arm_smmu_device {
>> 
>>      int                             gerr_irq;
>>      int                             combined_irq;
>> +    u32                             sync_nr;
>> +    u8                              prev_cmd_opcode;
>> 
>>      unsigned long                   ias; /* IPA */
>>      unsigned long                   oas; /* PA */
>> @@ -677,6 +648,12 @@ struct arm_smmu_device {
>> 
>>      struct arm_smmu_strtab_cfg      strtab_cfg;
>> 
>> +    /* Hi16xx adds an extra 32 bits of goodness to its MSI payload */
>> +    union {
>> +            u32                     sync_count;
>> +            u64                     padding;
>> +    };
>> +
>>      /* IOMMU core code handle */
>>      struct iommu_device             iommu;
>> };
>> @@ -763,21 +740,6 @@ static void parse_driver_options(struct arm_smmu_device 
>> *smmu)
>> }
>> 
>> /* Low-level queue manipulation functions */
>> -static bool queue_has_space(struct arm_smmu_ll_queue *q, u32 n)
>> -{
>> -    u32 space, prod, cons;
>> -
>> -    prod = Q_IDX(q, q->prod);
>> -    cons = Q_IDX(q, q->cons);
>> -
>> -    if (Q_WRP(q, q->prod) == Q_WRP(q, q->cons))
>> -            space = (1 << q->max_n_shift) - (prod - cons);
>> -    else
>> -            space = cons - prod;
>> -
>> -    return space >= n;
>> -}
>> -
>> static bool queue_full(struct arm_smmu_ll_queue *q)
>> {
>>      return Q_IDX(q, q->prod) == Q_IDX(q, q->cons) &&
>> @@ -790,12 +752,9 @@ static bool queue_empty(struct arm_smmu_ll_queue *q)
>>             Q_WRP(q, q->prod) == Q_WRP(q, q->cons);
>> }
>> 
>> -static bool queue_consumed(struct arm_smmu_ll_queue *q, u32 prod)
>> +static void queue_sync_cons_in(struct arm_smmu_queue *q)
>> {
>> -    return ((Q_WRP(q, q->cons) == Q_WRP(q, prod)) &&
>> -            (Q_IDX(q, q->cons) > Q_IDX(q, prod))) ||
>> -           ((Q_WRP(q, q->cons) != Q_WRP(q, prod)) &&
>> -            (Q_IDX(q, q->cons) <= Q_IDX(q, prod)));
>> +    q->llq.cons = readl_relaxed(q->cons_reg);
>> }
>> 
>> static void queue_sync_cons_out(struct arm_smmu_queue *q)
>> @@ -826,34 +785,46 @@ static int queue_sync_prod_in(struct arm_smmu_queue *q)
>>      return ret;
>> }
>> 
>> -static u32 queue_inc_prod_n(struct arm_smmu_ll_queue *q, int n)
>> +static void queue_sync_prod_out(struct arm_smmu_queue *q)
>> {
>> -    u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + n;
>> -    return Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
>> +    writel(q->llq.prod, q->prod_reg);
>> }
>> 
>> -static void queue_poll_init(struct arm_smmu_device *smmu,
>> -                        struct arm_smmu_queue_poll *qp)
>> +static void queue_inc_prod(struct arm_smmu_ll_queue *q)
>> {
>> -    qp->delay = 1;
>> -    qp->spin_cnt = 0;
>> -    qp->wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>> -    qp->timeout = ktime_add_us(ktime_get(), ARM_SMMU_POLL_TIMEOUT_US);
>> +    u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
>> +    q->prod = Q_OVF(q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
>> }
>> 
>> -static int queue_poll(struct arm_smmu_queue_poll *qp)
>> +/*
>> + * Wait for the SMMU to consume items. If sync is true, wait until the queue
>> + * is empty. Otherwise, wait until there is at least one free slot.
>> + */
>> +static int queue_poll_cons(struct arm_smmu_queue *q, bool sync, bool wfe)
>> {
>> -    if (ktime_compare(ktime_get(), qp->timeout) > 0)
>> -            return -ETIMEDOUT;
>> +    ktime_t timeout;
>> +    unsigned int delay = 1, spin_cnt = 0;
>> 
>> -    if (qp->wfe) {
>> -            wfe();
>> -    } else if (++qp->spin_cnt < ARM_SMMU_POLL_SPIN_COUNT) {
>> -            cpu_relax();
>> -    } else {
>> -            udelay(qp->delay);
>> -            qp->delay *= 2;
>> -            qp->spin_cnt = 0;
>> +    /* Wait longer if it's a CMD_SYNC */
>> +    timeout = ktime_add_us(ktime_get(), sync ?
>> +                                        ARM_SMMU_CMDQ_SYNC_TIMEOUT_US :
>> +                                        ARM_SMMU_POLL_TIMEOUT_US);
>> +
>> +    while (queue_sync_cons_in(q),
>> +          (sync ? !queue_empty(&q->llq) : queue_full(&q->llq))) {
>> +            if (ktime_compare(ktime_get(), timeout) > 0)
>> +                    return -ETIMEDOUT;
>> +
>> +            if (wfe) {
>> +                    wfe();
>> +            } else if (++spin_cnt < ARM_SMMU_CMDQ_SYNC_SPIN_COUNT) {
>> +                    cpu_relax();
>> +                    continue;
>> +            } else {
>> +                    udelay(delay);
>> +                    delay *= 2;
>> +                    spin_cnt = 0;
>> +            }
>>      }
>> 
>>      return 0;
>> @@ -867,6 +838,17 @@ static void queue_write(__le64 *dst, u64 *src, size_t 
>> n_dwords)
>>              *dst++ = cpu_to_le64(*src++);
>> }
>> 
>> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
>> +{
>> +    if (queue_full(&q->llq))
>> +            return -ENOSPC;
>> +
>> +    queue_write(Q_ENT(q, q->llq.prod), ent, q->ent_dwords);
>> +    queue_inc_prod(&q->llq);
>> +    queue_sync_prod_out(q);
>> +    return 0;
>> +}
>> +
>> static void queue_read(__le64 *dst, u64 *src, size_t n_dwords)
>> {
>>      int i;
>> @@ -964,14 +946,20 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
>> arm_smmu_cmdq_ent *ent)
>>              cmd[1] |= FIELD_PREP(CMDQ_PRI_1_RESP, ent->pri.resp);
>>              break;
>>      case CMDQ_OP_CMD_SYNC:
>> -            if (ent->sync.msiaddr) {
>> +            if (ent->sync.msiaddr)
>>                      cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, 
>> CMDQ_SYNC_0_CS_IRQ);
>> -                    cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
>> -            } else {
>> +            else
>>                      cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_CS, 
>> CMDQ_SYNC_0_CS_SEV);
>> -            }
>>              cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSH, ARM_SMMU_SH_ISH);
>>              cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIATTR, 
>> ARM_SMMU_MEMATTR_OIWB);
>> +            /*
>> +             * Commands are written little-endian, but we want the SMMU to
>> +             * receive MSIData, and thus write it back to memory, in CPU
>> +             * byte order, so big-endian needs an extra byteswap here.
>> +             */
>> +            cmd[0] |= FIELD_PREP(CMDQ_SYNC_0_MSIDATA,
>> +                                 cpu_to_le32(ent->sync.msidata));
>> +            cmd[1] |= ent->sync.msiaddr & CMDQ_SYNC_1_MSIADDR_MASK;
>>              break;
>>      default:
>>              return -ENOENT;
>> @@ -980,27 +968,6 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct 
>> arm_smmu_cmdq_ent *ent)
>>      return 0;
>> }
>> 
>> -static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device 
>> *smmu,
>> -                                     u32 prod)
>> -{
>> -    struct arm_smmu_queue *q = &smmu->cmdq.q;
>> -    struct arm_smmu_cmdq_ent ent = {
>> -            .opcode = CMDQ_OP_CMD_SYNC,
>> -    };
>> -
>> -    /*
>> -     * Beware that Hi16xx adds an extra 32 bits of goodness to its MSI
>> -     * payload, so the write will zero the entire command on that platform.
>> -     */
>> -    if (smmu->features & ARM_SMMU_FEAT_MSI &&
>> -        smmu->features & ARM_SMMU_FEAT_COHERENCY) {
>> -            ent.sync.msiaddr = q->base_dma + Q_IDX(&q->llq, prod) *
>> -                               q->ent_dwords * 8;
>> -    }
>> -
>> -    arm_smmu_cmdq_build_cmd(cmd, &ent);
>> -}
>> -
>> static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
>> {
>>      static const char *cerror_str[] = {
>> @@ -1058,474 +1025,109 @@ static void arm_smmu_cmdq_skip_err(struct 
>> arm_smmu_device *smmu)
>>      queue_write(Q_ENT(q, cons), cmd, q->ent_dwords);
>> }
>> 
>> -/*
>> - * Command queue locking.
>> - * This is a form of bastardised rwlock with the following major changes:
>> - *
>> - * - The only LOCK routines are exclusive_trylock() and shared_lock().
>> - *   Neither have barrier semantics, and instead provide only a control
>> - *   dependency.
>> - *
>> - * - The UNLOCK routines are supplemented with shared_tryunlock(), which
>> - *   fails if the caller appears to be the last lock holder (yes, this is
>> - *   racy). All successful UNLOCK routines have RELEASE semantics.
>> - */
>> -static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq)
>> +static void arm_smmu_cmdq_insert_cmd(struct arm_smmu_device *smmu, u64 *cmd)
>> {
>> -    int val;
>> -
>> -    /*
>> -     * We can try to avoid the cmpxchg() loop by simply incrementing the
>> -     * lock counter. When held in exclusive state, the lock counter is set
>> -     * to INT_MIN so these increments won't hurt as the value will remain
>> -     * negative.
>> -     */
>> -    if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0)
>> -            return;
>> -
>> -    do {
>> -            val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0);
>> -    } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val);
>> -}
>> -
>> -static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq)
>> -{
>> -    (void)atomic_dec_return_release(&cmdq->lock);
>> -}
>> -
>> -static bool arm_smmu_cmdq_shared_tryunlock(struct arm_smmu_cmdq *cmdq)
>> -{
>> -    if (atomic_read(&cmdq->lock) == 1)
>> -            return false;
>> -
>> -    arm_smmu_cmdq_shared_unlock(cmdq);
>> -    return true;
>> -}
>> -
>> -#define arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)                
>> \
>> -({                                                                  \
>> -    bool __ret;                                                     \
>> -    local_irq_save(flags);                                          \
>> -    __ret = !atomic_cmpxchg_relaxed(&cmdq->lock, 0, INT_MIN);       \
>> -    if (!__ret)                                                     \
>> -            local_irq_restore(flags);                               \
>> -    __ret;                                                          \
>> -})
>> -
>> -#define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags)              
>> \
>> -({                                                                  \
>> -    atomic_set_release(&cmdq->lock, 0);                             \
>> -    local_irq_restore(flags);                                       \
>> -})
>> -
>> -
>> -/*
>> - * Command queue insertion.
>> - * This is made fiddly by our attempts to achieve some sort of scalability
>> - * since there is one queue shared amongst all of the CPUs in the system.  
>> If
>> - * you like mixed-size concurrency, dependency ordering and relaxed atomics,
>> - * then you'll *love* this monstrosity.
>> - *
>> - * The basic idea is to split the queue up into ranges of commands that are
>> - * owned by a given CPU; the owner may not have written all of the commands
>> - * itself, but is responsible for advancing the hardware prod pointer when
>> - * the time comes. The algorithm is roughly:
>> - *
>> - *  1. Allocate some space in the queue. At this point we also discover
>> - *     whether the head of the queue is currently owned by another CPU,
>> - *     or whether we are the owner.
>> - *
>> - *  2. Write our commands into our allocated slots in the queue.
>> - *
>> - *  3. Mark our slots as valid in arm_smmu_cmdq.valid_map.
>> - *
>> - *  4. If we are an owner:
>> - *          a. Wait for the previous owner to finish.
>> - *          b. Mark the queue head as unowned, which tells us the range
>> - *             that we are responsible for publishing.
>> - *          c. Wait for all commands in our owned range to become valid.
>> - *          d. Advance the hardware prod pointer.
>> - *          e. Tell the next owner we've finished.
>> - *
>> - *  5. If we are inserting a CMD_SYNC (we may or may not have been an
>> - *     owner), then we need to stick around until it has completed:
>> - *          a. If we have MSIs, the SMMU can write back into the CMD_SYNC
>> - *             to clear the first 4 bytes.
>> - *          b. Otherwise, we spin waiting for the hardware cons pointer to
>> - *             advance past our command.
>> - *
>> - * The devil is in the details, particularly the use of locking for handling
>> - * SYNC completion and freeing up space in the queue before we think that 
>> it is
>> - * full.
>> - */
>> -static void __arm_smmu_cmdq_poll_set_valid_map(struct arm_smmu_cmdq *cmdq,
>> -                                           u32 sprod, u32 eprod, bool set)
>> -{
>> -    u32 swidx, sbidx, ewidx, ebidx;
>> -    struct arm_smmu_ll_queue llq = {
>> -            .max_n_shift    = cmdq->q.llq.max_n_shift,
>> -            .prod           = sprod,
>> -    };
>> -
>> -    ewidx = BIT_WORD(Q_IDX(&llq, eprod));
>> -    ebidx = Q_IDX(&llq, eprod) % BITS_PER_LONG;
>> -
>> -    while (llq.prod != eprod) {
>> -            unsigned long mask;
>> -            atomic_long_t *ptr;
>> -            u32 limit = BITS_PER_LONG;
>> -
>> -            swidx = BIT_WORD(Q_IDX(&llq, llq.prod));
>> -            sbidx = Q_IDX(&llq, llq.prod) % BITS_PER_LONG;
>> -
>> -            ptr = &cmdq->valid_map[swidx];
>> -
>> -            if ((swidx == ewidx) && (sbidx < ebidx))
>> -                    limit = ebidx;
>> -
>> -            mask = GENMASK(limit - 1, sbidx);
>> -
>> -            /*
>> -             * The valid bit is the inverse of the wrap bit. This means
>> -             * that a zero-initialised queue is invalid and, after marking
>> -             * all entries as valid, they become invalid again when we
>> -             * wrap.
>> -             */
>> -            if (set) {
>> -                    atomic_long_xor(mask, ptr);
>> -            } else { /* Poll */
>> -                    unsigned long valid;
>> +    struct arm_smmu_queue *q = &smmu->cmdq.q;
>> +    bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>> 
>> -                    valid = (ULONG_MAX + !!Q_WRP(&llq, llq.prod)) & mask;
>> -                    atomic_long_cond_read_relaxed(ptr, (VAL & mask) == 
>> valid);
>> -            }
>> +    smmu->prev_cmd_opcode = FIELD_GET(CMDQ_0_OP, cmd[0]);
>> 
>> -            llq.prod = queue_inc_prod_n(&llq, limit - sbidx);
>> +    while (queue_insert_raw(q, cmd) == -ENOSPC) {
>> +            if (queue_poll_cons(q, false, wfe))
>> +                    dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>>      }
>> }
>> 
>> -/* Mark all entries in the range [sprod, eprod) as valid */
>> -static void arm_smmu_cmdq_set_valid_map(struct arm_smmu_cmdq *cmdq,
>> -                                    u32 sprod, u32 eprod)
>> -{
>> -    __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, true);
>> -}
>> -
>> -/* Wait for all entries in the range [sprod, eprod) to become valid */
>> -static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq,
>> -                                     u32 sprod, u32 eprod)
>> -{
>> -    __arm_smmu_cmdq_poll_set_valid_map(cmdq, sprod, eprod, false);
>> -}
>> -
>> -/* Wait for the command queue to become non-full */
>> -static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu,
>> -                                         struct arm_smmu_ll_queue *llq)
>> +static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>> +                                struct arm_smmu_cmdq_ent *ent)
>> {
>> +    u64 cmd[CMDQ_ENT_DWORDS];
>>      unsigned long flags;
>> -    struct arm_smmu_queue_poll qp;
>> -    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
>> -    int ret = 0;
>> 
>> -    /*
>> -     * Try to update our copy of cons by grabbing exclusive cmdq access. If
>> -     * that fails, spin until somebody else updates it for us.
>> -     */
>> -    if (arm_smmu_cmdq_exclusive_trylock_irqsave(cmdq, flags)) {
>> -            WRITE_ONCE(cmdq->q.llq.cons, readl_relaxed(cmdq->q.cons_reg));
>> -            arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags);
>> -            llq->val = READ_ONCE(cmdq->q.llq.val);
>> -            return 0;
>> +    if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
>> +            dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
>> +                     ent->opcode);
>> +            return;
>>      }
>> 
>> -    queue_poll_init(smmu, &qp);
>> -    do {
>> -            llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
>> -            if (!queue_full(llq))
>> -                    break;
>> -
>> -            ret = queue_poll(&qp);
>> -    } while (!ret);
>> -
>> -    return ret;
>> -}
>> -
>> -/*
>> - * Wait until the SMMU signals a CMD_SYNC completion MSI.
>> - * Must be called with the cmdq lock held in some capacity.
>> - */
>> -static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu,
>> -                                      struct arm_smmu_ll_queue *llq)
>> -{
>> -    int ret = 0;
>> -    struct arm_smmu_queue_poll qp;
>> -    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
>> -    u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod));
>> -
>> -    queue_poll_init(smmu, &qp);
>> -
>> -    /*
>> -     * The MSI won't generate an event, since it's being written back
>> -     * into the command queue.
>> -     */
>> -    qp.wfe = false;
>> -    smp_cond_load_relaxed(cmd, !VAL || (ret = queue_poll(&qp)));
>> -    llq->cons = ret ? llq->prod : queue_inc_prod_n(llq, 1);
>> -    return ret;
>> +    spin_lock_irqsave(&smmu->cmdq.lock, flags);
>> +    arm_smmu_cmdq_insert_cmd(smmu, cmd);
>> +    spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>> }
>> 
>> /*
>> - * Wait until the SMMU cons index passes llq->prod.
>> - * Must be called with the cmdq lock held in some capacity.
>> + * The difference between val and sync_idx is bounded by the maximum size of
>> + * a queue at 2^20 entries, so 32 bits is plenty for wrap-safe arithmetic.
>>  */
>> -static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu,
>> -                                           struct arm_smmu_ll_queue *llq)
>> +static int __arm_smmu_sync_poll_msi(struct arm_smmu_device *smmu, u32 
>> sync_idx)
>> {
>> -    struct arm_smmu_queue_poll qp;
>> -    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
>> -    u32 prod = llq->prod;
>> -    int ret = 0;
>> +    ktime_t timeout;
>> +    u32 val;
>> 
>> -    queue_poll_init(smmu, &qp);
>> -    llq->val = READ_ONCE(smmu->cmdq.q.llq.val);
>> -    do {
>> -            if (queue_consumed(llq, prod))
>> -                    break;
>> -
>> -            ret = queue_poll(&qp);
>> -
>> -            /*
>> -             * This needs to be a readl() so that our subsequent call
>> -             * to arm_smmu_cmdq_shared_tryunlock() can fail accurately.
>> -             *
>> -             * Specifically, we need to ensure that we observe all
>> -             * shared_lock()s by other CMD_SYNCs that share our owner,
>> -             * so that a failing call to tryunlock() means that we're
>> -             * the last one out and therefore we can safely advance
>> -             * cmdq->q.llq.cons. Roughly speaking:
>> -             *
>> -             * CPU 0                CPU1                    CPU2 (us)
>> -             *
>> -             * if (sync)
>> -             *      shared_lock();
>> -             *
>> -             * dma_wmb();
>> -             * set_valid_map();
>> -             *
>> -             *                      if (owner) {
>> -             *                              poll_valid_map();
>> -             *                              <control dependency>
>> -             *                              writel(prod_reg);
>> -             *
>> -             *                                              readl(cons_reg);
>> -             *                                              tryunlock();
>> -             *
>> -             * Requires us to see CPU 0's shared_lock() acquisition.
>> -             */
>> -            llq->cons = readl(cmdq->q.cons_reg);
>> -    } while (!ret);
>> +    timeout = ktime_add_us(ktime_get(), ARM_SMMU_CMDQ_SYNC_TIMEOUT_US);
>> +    val = smp_cond_load_acquire(&smmu->sync_count,
>> +                                (int)(VAL - sync_idx) >= 0 ||
>> +                                !ktime_before(ktime_get(), timeout));
>> 
>> -    return ret;
>> +    return (int)(val - sync_idx) < 0 ? -ETIMEDOUT : 0;
>> }
>> 
>> -static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu,
>> -                                     struct arm_smmu_ll_queue *llq)
>> +static int __arm_smmu_cmdq_issue_sync_msi(struct arm_smmu_device *smmu)
>> {
>> -    if (smmu->features & ARM_SMMU_FEAT_MSI &&
>> -        smmu->features & ARM_SMMU_FEAT_COHERENCY)
>> -            return __arm_smmu_cmdq_poll_until_msi(smmu, llq);
>> -
>> -    return __arm_smmu_cmdq_poll_until_consumed(smmu, llq);
>> -}
>> -
>> -static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 
>> *cmds,
>> -                                    u32 prod, int n)
>> -{
>> -    int i;
>> -    struct arm_smmu_ll_queue llq = {
>> -            .max_n_shift    = cmdq->q.llq.max_n_shift,
>> -            .prod           = prod,
>> -    };
>> -
>> -    for (i = 0; i < n; ++i) {
>> -            u64 *cmd = &cmds[i * CMDQ_ENT_DWORDS];
>> -
>> -            prod = queue_inc_prod_n(&llq, i);
>> -            queue_write(Q_ENT(&cmdq->q, prod), cmd, CMDQ_ENT_DWORDS);
>> -    }
>> -}
>> -
>> -/*
>> - * This is the actual insertion function, and provides the following
>> - * ordering guarantees to callers:
>> - *
>> - * - There is a dma_wmb() before publishing any commands to the queue.
>> - *   This can be relied upon to order prior writes to data structures
>> - *   in memory (such as a CD or an STE) before the command.
>> - *
>> - * - On completion of a CMD_SYNC, there is a control dependency.
>> - *   This can be relied upon to order subsequent writes to memory (e.g.
>> - *   freeing an IOVA) after completion of the CMD_SYNC.
>> - *
>> - * - Command insertion is totally ordered, so if two CPUs each race to
>> - *   insert their own list of commands then all of the commands from one
>> - *   CPU will appear before any of the commands from the other CPU.
>> - */
>> -static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu,
>> -                                   u64 *cmds, int n, bool sync)
>> -{
>> -    u64 cmd_sync[CMDQ_ENT_DWORDS];
>> -    u32 prod;
>> +    u64 cmd[CMDQ_ENT_DWORDS];
>>      unsigned long flags;
>> -    bool owner;
>> -    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
>> -    struct arm_smmu_ll_queue llq = {
>> -            .max_n_shift = cmdq->q.llq.max_n_shift,
>> -    }, head = llq;
>> -    int ret = 0;
>> -
>> -    /* 1. Allocate some space in the queue */
>> -    local_irq_save(flags);
>> -    llq.val = READ_ONCE(cmdq->q.llq.val);
>> -    do {
>> -            u64 old;
>> -
>> -            while (!queue_has_space(&llq, n + sync)) {
>> -                    local_irq_restore(flags);
>> -                    if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq))
>> -                            dev_err_ratelimited(smmu->dev, "CMDQ 
>> timeout\n");
>> -                    local_irq_save(flags);
>> -            }
>> -
>> -            head.cons = llq.cons;
>> -            head.prod = queue_inc_prod_n(&llq, n + sync) |
>> -                                         CMDQ_PROD_OWNED_FLAG;
>> -
>> -            old = cmpxchg_relaxed(&cmdq->q.llq.val, llq.val, head.val);
>> -            if (old == llq.val)
>> -                    break;
>> -
>> -            llq.val = old;
>> -    } while (1);
>> -    owner = !(llq.prod & CMDQ_PROD_OWNED_FLAG);
>> -    head.prod &= ~CMDQ_PROD_OWNED_FLAG;
>> -    llq.prod &= ~CMDQ_PROD_OWNED_FLAG;
>> -
>> -    /*
>> -     * 2. Write our commands into the queue
>> -     * Dependency ordering from the cmpxchg() loop above.
>> -     */
>> -    arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n);
>> -    if (sync) {
>> -            prod = queue_inc_prod_n(&llq, n);
>> -            arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, prod);
>> -            queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS);
>> -
>> -            /*
>> -             * In order to determine completion of our CMD_SYNC, we must
>> -             * ensure that the queue can't wrap twice without us noticing.
>> -             * We achieve that by taking the cmdq lock as shared before
>> -             * marking our slot as valid.
>> -             */
>> -            arm_smmu_cmdq_shared_lock(cmdq);
>> -    }
>> -
>> -    /* 3. Mark our slots as valid, ensuring commands are visible first */
>> -    dma_wmb();
>> -    arm_smmu_cmdq_set_valid_map(cmdq, llq.prod, head.prod);
>> -
>> -    /* 4. If we are the owner, take control of the SMMU hardware */
>> -    if (owner) {
>> -            /* a. Wait for previous owner to finish */
>> -            atomic_cond_read_relaxed(&cmdq->owner_prod, VAL == llq.prod);
>> -
>> -            /* b. Stop gathering work by clearing the owned flag */
>> -            prod = atomic_fetch_andnot_relaxed(CMDQ_PROD_OWNED_FLAG,
>> -                                               &cmdq->q.llq.atomic.prod);
>> -            prod &= ~CMDQ_PROD_OWNED_FLAG;
>> +    struct arm_smmu_cmdq_ent  ent = {
>> +            .opcode = CMDQ_OP_CMD_SYNC,
>> +            .sync   = {
>> +                    .msiaddr = virt_to_phys(&smmu->sync_count),
>> +            },
>> +    };
>> 
>> -            /*
>> -             * c. Wait for any gathered work to be written to the queue.
>> -             * Note that we read our own entries so that we have the control
>> -             * dependency required by (d).
>> -             */
>> -            arm_smmu_cmdq_poll_valid_map(cmdq, llq.prod, prod);
>> +    spin_lock_irqsave(&smmu->cmdq.lock, flags);
>> 
>> -            /*
>> -             * d. Advance the hardware prod pointer
>> -             * Control dependency ordering from the entries becoming valid.
>> -             */
>> -            writel_relaxed(prod, cmdq->q.prod_reg);
>> -
>> -            /*
>> -             * e. Tell the next owner we're done
>> -             * Make sure we've updated the hardware first, so that we don't
>> -             * race to update prod and potentially move it backwards.
>> -             */
>> -            atomic_set_release(&cmdq->owner_prod, prod);
>> +    /* Piggy-back on the previous command if it's a SYNC */
>> +    if (smmu->prev_cmd_opcode == CMDQ_OP_CMD_SYNC) {
>> +            ent.sync.msidata = smmu->sync_nr;
>> +    } else {
>> +            ent.sync.msidata = ++smmu->sync_nr;
>> +            arm_smmu_cmdq_build_cmd(cmd, &ent);
>> +            arm_smmu_cmdq_insert_cmd(smmu, cmd);
>>      }
>> 
>> -    /* 5. If we are inserting a CMD_SYNC, we must wait for it to complete */
>> -    if (sync) {
>> -            llq.prod = queue_inc_prod_n(&llq, n);
>> -            ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq);
>> -            if (ret) {
>> -                    dev_err_ratelimited(smmu->dev,
>> -                                        "CMD_SYNC timeout at 0x%08x [hwprod 
>> 0x%08x, hwcons 0x%08x]\n",
>> -                                        llq.prod,
>> -                                        readl_relaxed(cmdq->q.prod_reg),
>> -                                        readl_relaxed(cmdq->q.cons_reg));
>> -            }
>> -
>> -            /*
>> -             * Try to unlock the cmdq lock. This will fail if we're the last
>> -             * reader, in which case we can safely update cmdq->q.llq.cons
>> -             */
>> -            if (!arm_smmu_cmdq_shared_tryunlock(cmdq)) {
>> -                    WRITE_ONCE(cmdq->q.llq.cons, llq.cons);
>> -                    arm_smmu_cmdq_shared_unlock(cmdq);
>> -            }
>> -    }
>> +    spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>> 
>> -    local_irq_restore(flags);
>> -    return ret;
>> +    return __arm_smmu_sync_poll_msi(smmu, ent.sync.msidata);
>> }
>> 
>> -static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>> -                               struct arm_smmu_cmdq_ent *ent)
>> +static int __arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>> {
>>      u64 cmd[CMDQ_ENT_DWORDS];
>> +    unsigned long flags;
>> +    bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>> +    struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC };
>> +    int ret;
>> 
>> -    if (arm_smmu_cmdq_build_cmd(cmd, ent)) {
>> -            dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n",
>> -                     ent->opcode);
>> -            return -EINVAL;
>> -    }
>> +    arm_smmu_cmdq_build_cmd(cmd, &ent);
>> 
>> -    return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false);
>> -}
>> +    spin_lock_irqsave(&smmu->cmdq.lock, flags);
>> +    arm_smmu_cmdq_insert_cmd(smmu, cmd);
>> +    ret = queue_poll_cons(&smmu->cmdq.q, true, wfe);
>> +    spin_unlock_irqrestore(&smmu->cmdq.lock, flags);
>> 
>> -static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>> -{
>> -    return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true);
>> +    return ret;
>> }
>> 
>> -static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
>> -                                struct arm_smmu_cmdq_batch *cmds,
>> -                                struct arm_smmu_cmdq_ent *cmd)
>> +static int arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu)
>> {
>> -    if (cmds->num == CMDQ_BATCH_ENTRIES) {
>> -            arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false);
>> -            cmds->num = 0;
>> -    }
>> -    arm_smmu_cmdq_build_cmd(&cmds->cmds[cmds->num * CMDQ_ENT_DWORDS], cmd);
>> -    cmds->num++;
>> -}
>> +    int ret;
>> +    bool msi = (smmu->features & ARM_SMMU_FEAT_MSI) &&
>> +               (smmu->features & ARM_SMMU_FEAT_COHERENCY);
>> 
>> -static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu,
>> -                                  struct arm_smmu_cmdq_batch *cmds)
>> -{
>> -    return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true);
>> +    ret = msi ? __arm_smmu_cmdq_issue_sync_msi(smmu)
>> +              : __arm_smmu_cmdq_issue_sync(smmu);
>> +    if (ret)
>> +            dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout\n");
>> +    return ret;
>> }
>> 
>> /* Context descriptor manipulation functions */
>> @@ -1535,7 +1137,6 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain 
>> *smmu_domain,
>>      size_t i;
>>      unsigned long flags;
>>      struct arm_smmu_master *master;
>> -    struct arm_smmu_cmdq_batch cmds = {};
>>      struct arm_smmu_device *smmu = smmu_domain->smmu;
>>      struct arm_smmu_cmdq_ent cmd = {
>>              .opcode = CMDQ_OP_CFGI_CD,
>> @@ -1549,12 +1150,12 @@ static void arm_smmu_sync_cd(struct arm_smmu_domain 
>> *smmu_domain,
>>      list_for_each_entry(master, &smmu_domain->devices, domain_head) {
>>              for (i = 0; i < master->num_sids; i++) {
>>                      cmd.cfgi.sid = master->sids[i];
>> -                    arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
>> +                    arm_smmu_cmdq_issue_cmd(smmu, &cmd);
>>              }
>>      }
>>      spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>> 
>> -    arm_smmu_cmdq_batch_submit(smmu, &cmds);
>> +    arm_smmu_cmdq_issue_sync(smmu);
>> }
>> 
>> static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu,
>> @@ -2189,16 +1790,17 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long 
>> iova, size_t size,
>>      cmd->atc.size   = log2_span;
>> }
>> 
>> -static int arm_smmu_atc_inv_master(struct arm_smmu_master *master)
>> +static int arm_smmu_atc_inv_master(struct arm_smmu_master *master,
>> +                               struct arm_smmu_cmdq_ent *cmd)
>> {
>>      int i;
>> -    struct arm_smmu_cmdq_ent cmd;
>> 
>> -    arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
>> +    if (!master->ats_enabled)
>> +            return 0;
>> 
>>      for (i = 0; i < master->num_sids; i++) {
>> -            cmd.atc.sid = master->sids[i];
>> -            arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
>> +            cmd->atc.sid = master->sids[i];
>> +            arm_smmu_cmdq_issue_cmd(master->smmu, cmd);
>>      }
>> 
>>      return arm_smmu_cmdq_issue_sync(master->smmu);
>> @@ -2207,11 +1809,10 @@ static int arm_smmu_atc_inv_master(struct 
>> arm_smmu_master *master)
>> static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
>>                                 int ssid, unsigned long iova, size_t size)
>> {
>> -    int i;
>> +    int ret = 0;
>>      unsigned long flags;
>>      struct arm_smmu_cmdq_ent cmd;
>>      struct arm_smmu_master *master;
>> -    struct arm_smmu_cmdq_batch cmds = {};
>> 
>>      if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS))
>>              return 0;
>> @@ -2236,18 +1837,11 @@ static int arm_smmu_atc_inv_domain(struct 
>> arm_smmu_domain *smmu_domain,
>>      arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd);
>> 
>>      spin_lock_irqsave(&smmu_domain->devices_lock, flags);
>> -    list_for_each_entry(master, &smmu_domain->devices, domain_head) {
>> -            if (!master->ats_enabled)
>> -                    continue;
>> -
>> -            for (i = 0; i < master->num_sids; i++) {
>> -                    cmd.atc.sid = master->sids[i];
>> -                    arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd);
>> -            }
>> -    }
>> +    list_for_each_entry(master, &smmu_domain->devices, domain_head)
>> +            ret |= arm_smmu_atc_inv_master(master, &cmd);
>>      spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
>> 
>> -    return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds);
>> +    return ret ? -ETIMEDOUT : 0;
>> }
>> 
>> /* IO_PGTABLE API */
>> @@ -2269,32 +1863,27 @@ static void arm_smmu_tlb_inv_context(void *cookie)
>>      /*
>>       * NOTE: when io-pgtable is in non-strict mode, we may get here with
>>       * PTEs previously cleared by unmaps on the current CPU not yet visible
>> -     * to the SMMU. We are relying on the dma_wmb() implicit during cmd
>> -     * insertion to guarantee those are observed before the TLBI. Do be
>> -     * careful, 007.
>> +     * to the SMMU. We are relying on the DSB implicit in
>> +     * queue_sync_prod_out() to guarantee those are observed before the
>> +     * TLBI. Do be careful, 007.
>>       */
>>      arm_smmu_cmdq_issue_cmd(smmu, &cmd);
>>      arm_smmu_cmdq_issue_sync(smmu);
>>      arm_smmu_atc_inv_domain(smmu_domain, 0, 0, 0);
>> }
>> 
>> -static void arm_smmu_tlb_inv_range(unsigned long iova, size_t size,
>> -                               size_t granule, bool leaf,
>> -                               struct arm_smmu_domain *smmu_domain)
>> +static void arm_smmu_tlb_inv_range_nosync(unsigned long iova, size_t size,
>> +                                      size_t granule, bool leaf, void 
>> *cookie)
>> {
>> +    struct arm_smmu_domain *smmu_domain = cookie;
>>      struct arm_smmu_device *smmu = smmu_domain->smmu;
>> -    unsigned long start = iova, end = iova + size, num_pages = 0, tg = 0;
>> -    size_t inv_range = granule;
>> -    struct arm_smmu_cmdq_batch cmds = {};
>>      struct arm_smmu_cmdq_ent cmd = {
>>              .tlbi = {
>>                      .leaf   = leaf,
>> +                    .addr   = iova,
>>              },
>>      };
>> 
>> -    if (!size)
>> -            return;
>> -
>>      if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) {
>>              cmd.opcode      = CMDQ_OP_TLBI_NH_VA;
>>              cmd.tlbi.asid   = smmu_domain->s1_cfg.cd.asid;
>> @@ -2303,78 +1892,37 @@ static void arm_smmu_tlb_inv_range(unsigned long 
>> iova, size_t size,
>>              cmd.tlbi.vmid   = smmu_domain->s2_cfg.vmid;
>>      }
>> 
>> -    if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>> -            /* Get the leaf page size */
>> -            tg = __ffs(smmu_domain->domain.pgsize_bitmap);
>> -
>> -            /* Convert page size of 12,14,16 (log2) to 1,2,3 */
>> -            cmd.tlbi.tg = (tg - 10) / 2;
>> -
>> -            /* Determine what level the granule is at */
>> -            cmd.tlbi.ttl = 4 - ((ilog2(granule) - 3) / (tg - 3));
>> -
>> -            num_pages = size >> tg;
>> -    }
>> -
>> -    while (iova < end) {
>> -            if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
>> -                    /*
>> -                     * On each iteration of the loop, the range is 5 bits
>> -                     * worth of the aligned size remaining.
>> -                     * The range in pages is:
>> -                     *
>> -                     * range = (num_pages & (0x1f << __ffs(num_pages)))
>> -                     */
>> -                    unsigned long scale, num;
>> -
>> -                    /* Determine the power of 2 multiple number of pages */
>> -                    scale = __ffs(num_pages);
>> -                    cmd.tlbi.scale = scale;
>> -
>> -                    /* Determine how many chunks of 2^scale size we have */
>> -                    num = (num_pages >> scale) & CMDQ_TLBI_RANGE_NUM_MAX;
>> -                    cmd.tlbi.num = num - 1;
>> -
>> -                    /* range is num * 2^scale * pgsize */
>> -                    inv_range = num << (scale + tg);
>> -
>> -                    /* Clear out the lower order bits for the next 
>> iteration */
>> -                    num_pages -= num << scale;
>> -            }
>> -
>> -            cmd.tlbi.addr = iova;
>> -            arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
>> -            iova += inv_range;
>> -    }
>> -    arm_smmu_cmdq_batch_submit(smmu, &cmds);
>> -
>> -    /*
>> -     * Unfortunately, this can't be leaf-only since we may have
>> -     * zapped an entire table.
>> -     */
>> -    arm_smmu_atc_inv_domain(smmu_domain, 0, start, size);
>> +    do {
>> +            arm_smmu_cmdq_issue_cmd(smmu, &cmd);
>> +            cmd.tlbi.addr += granule;
>> +    } while (size -= granule);
>> }
>> 
>> static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
>>                                       unsigned long iova, size_t granule,
>>                                       void *cookie)
>> {
>> -    struct arm_smmu_domain *smmu_domain = cookie;
>> -    struct iommu_domain *domain = &smmu_domain->domain;
>> -
>> -    iommu_iotlb_gather_add_page(domain, gather, iova, granule);
>> +    arm_smmu_tlb_inv_range_nosync(iova, granule, granule, true, cookie);
>> }
>> 
>> static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size,
>>                                size_t granule, void *cookie)
>> {
>> -    arm_smmu_tlb_inv_range(iova, size, granule, false, cookie);
>> +    struct arm_smmu_domain *smmu_domain = cookie;
>> +    struct arm_smmu_device *smmu = smmu_domain->smmu;
>> +
>> +    arm_smmu_tlb_inv_range_nosync(iova, size, granule, false, cookie);
>> +    arm_smmu_cmdq_issue_sync(smmu);
>> }
>> 
>> static void arm_smmu_tlb_inv_leaf(unsigned long iova, size_t size,
>>                                size_t granule, void *cookie)
>> {
>> -    arm_smmu_tlb_inv_range(iova, size, granule, true, cookie);
>> +    struct arm_smmu_domain *smmu_domain = cookie;
>> +    struct arm_smmu_device *smmu = smmu_domain->smmu;
>> +
>> +    arm_smmu_tlb_inv_range_nosync(iova, size, granule, true, cookie);
>> +    arm_smmu_cmdq_issue_sync(smmu);
>> }
>> 
>> static const struct iommu_flush_ops arm_smmu_flush_ops = {
>> @@ -2700,6 +2248,7 @@ static void arm_smmu_enable_ats(struct arm_smmu_master 
>> *master)
>> 
>> static void arm_smmu_disable_ats(struct arm_smmu_master *master)
>> {
>> +    struct arm_smmu_cmdq_ent cmd;
>>      struct arm_smmu_domain *smmu_domain = master->domain;
>> 
>>      if (!master->ats_enabled)
>> @@ -2711,8 +2260,9 @@ static void arm_smmu_disable_ats(struct 
>> arm_smmu_master *master)
>>       * ATC invalidation via the SMMU.
>>       */
>>      wmb();
>> -    arm_smmu_atc_inv_master(master);
>> -    atomic_dec(&smmu_domain->nr_ats_masters);
>> +    arm_smmu_atc_inv_to_cmd(0, 0, 0, &cmd);
>> +    arm_smmu_atc_inv_master(master, &cmd);
>> +    atomic_dec(&smmu_domain->nr_ats_masters);
>> }
>> 
>> static int arm_smmu_enable_pasid(struct arm_smmu_master *master)
>> @@ -2875,10 +2425,10 @@ static void arm_smmu_flush_iotlb_all(struct 
>> iommu_domain *domain)
>> static void arm_smmu_iotlb_sync(struct iommu_domain *domain,
>>                              struct iommu_iotlb_gather *gather)
>> {
>> -    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>> +    struct arm_smmu_device *smmu = to_smmu_domain(domain)->smmu;
>> 
>> -    arm_smmu_tlb_inv_range(gather->start, gather->end - gather->start,
>> -                           gather->pgsize, true, smmu_domain);
>> +    if (smmu)
>> +            arm_smmu_cmdq_issue_sync(smmu);
>> }
>> 
>> static phys_addr_t
>> @@ -3176,49 +2726,18 @@ static int arm_smmu_init_one_queue(struct 
>> arm_smmu_device *smmu,
>>      return 0;
>> }
>> 
>> -static void arm_smmu_cmdq_free_bitmap(void *data)
>> -{
>> -    unsigned long *bitmap = data;
>> -    bitmap_free(bitmap);
>> -}
>> -
>> -static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu)
>> -{
>> -    int ret = 0;
>> -    struct arm_smmu_cmdq *cmdq = &smmu->cmdq;
>> -    unsigned int nents = 1 << cmdq->q.llq.max_n_shift;
>> -    atomic_long_t *bitmap;
>> -
>> -    atomic_set(&cmdq->owner_prod, 0);
>> -    atomic_set(&cmdq->lock, 0);
>> -
>> -    bitmap = (atomic_long_t *)bitmap_zalloc(nents, GFP_KERNEL);
>> -    if (!bitmap) {
>> -            dev_err(smmu->dev, "failed to allocate cmdq bitmap\n");
>> -            ret = -ENOMEM;
>> -    } else {
>> -            cmdq->valid_map = bitmap;
>> -            devm_add_action(smmu->dev, arm_smmu_cmdq_free_bitmap, bitmap);
>> -    }
>> -
>> -    return ret;
>> -}
>> -
>> static int arm_smmu_init_queues(struct arm_smmu_device *smmu)
>> {
>>      int ret;
>> 
>>      /* cmdq */
>> +    spin_lock_init(&smmu->cmdq.lock);
>>      ret = arm_smmu_init_one_queue(smmu, &smmu->cmdq.q, ARM_SMMU_CMDQ_PROD,
>>                                    ARM_SMMU_CMDQ_CONS, CMDQ_ENT_DWORDS,
>>                                    "cmdq");
>>      if (ret)
>>              return ret;
>> 
>> -    ret = arm_smmu_cmdq_init(smmu);
>> -    if (ret)
>> -            return ret;
>> -
>>      /* evtq */
>>      ret = arm_smmu_init_one_queue(smmu, &smmu->evtq.q, ARM_SMMU_EVTQ_PROD,
>>                                    ARM_SMMU_EVTQ_CONS, EVTQ_ENT_DWORDS,
>> @@ -3799,15 +3318,9 @@ static int arm_smmu_device_hw_probe(struct 
>> arm_smmu_device *smmu)
>>      /* Queue sizes, capped to ensure natural alignment */
>>      smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT,
>>                                           FIELD_GET(IDR1_CMDQS, reg));
>> -    if (smmu->cmdq.q.llq.max_n_shift <= ilog2(CMDQ_BATCH_ENTRIES)) {
>> -            /*
>> -             * We don't support splitting up batches, so one batch of
>> -             * commands plus an extra sync needs to fit inside the command
>> -             * queue. There's also no way we can handle the weird alignment
>> -             * restrictions on the base pointer for a unit-length queue.
>> -             */
>> -            dev_err(smmu->dev, "command queue size <= %d entries not 
>> supported\n",
>> -                    CMDQ_BATCH_ENTRIES);
>> +    if (!smmu->cmdq.q.llq.max_n_shift) {
>> +            /* Odd alignment restrictions on the base, so ignore for now */
>> +            dev_err(smmu->dev, "unit-length command queue not supported\n");
>>              return -ENXIO;
>>      }
>> 
>> -- 
>> 2.17.1
>> 




 

