[Xen-devel] [PATCH 1/4] qemu-xen: backport xen_disk from upstream qemu

xen_disk is a pure userspace blkback implementation that can be used to
provide a disk backend called qdisk.
It is particularly useful with a dom0 kernel that doesn't have blktap2
(Linux 2.6.37).

Signed-off-by: Stefano Stabellini <stefano.stabellini@xxxxxxxxxxxxx>

diff --git a/hw/xen_backend.h b/hw/xen_backend.h
index 7e89ef4..c48c593 100644
--- a/hw/xen_backend.h
+++ b/hw/xen_backend.h
@@ -89,6 +89,7 @@ void xen_be_printf(struct XenDevice *xendev, int msg_level, const char *fmt, ...
 extern struct XenDevOps xen_console_ops;      /* xen_console.c     */
 extern struct XenDevOps xen_kbdmouse_ops;     /* xen_framebuffer.c */
 extern struct XenDevOps xen_framebuffer_ops;  /* xen_framebuffer.c */
+extern struct XenDevOps xen_blkdev_ops;       /* xen_disk.c        */
 
 void xen_init_display(int domid);
 
diff --git a/hw/xen_blkif.h b/hw/xen_blkif.h
new file mode 100644
index 0000000..ca3a65b
--- /dev/null
+++ b/hw/xen_blkif.h
@@ -0,0 +1,103 @@
+#ifndef __XEN_BLKIF_H__
+#define __XEN_BLKIF_H__
+
+#include <xen/io/ring.h>
+#include <xen/io/blkif.h>
+#include <xen/io/protocols.h>
+
+/* Not a real protocol.  Used to generate ring structs which contain
+ * the elements common to all protocols only.  This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places.  */
+struct blkif_common_request {   /* placeholder request type passed to DEFINE_RING_TYPES(blkif_common, ...) */
+       char dummy;              /* filler member */
+};
+struct blkif_common_response {  /* placeholder response type passed to DEFINE_RING_TYPES(blkif_common, ...) */
+       char dummy;              /* filler member */
+};
+
+/* i386 protocol version */
+#pragma pack(push, 4)
+struct blkif_x86_32_request {   /* request layout under the 4-byte packing pushed above */
+       uint8_t        operation;    /* BLKIF_OP_???                         */
+       uint8_t        nr_segments;  /* number of segments                   */
+       blkif_vdev_t   handle;       /* only for read/write requests         */
+       uint64_t       id;           /* private guest value, echoed in resp  */
+       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+       struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* first nr_segments entries are copied by blkif_get_x86_32_req() */
+};
+struct blkif_x86_32_response {  /* response layout under the same 4-byte packing */
+       uint64_t        id;              /* copied from request */
+       uint8_t         operation;       /* copied from request */
+       int16_t         status;          /* BLKIF_RSP_???       */
+};
+typedef struct blkif_x86_32_request blkif_x86_32_request_t;
+typedef struct blkif_x86_32_response blkif_x86_32_response_t;
+#pragma pack(pop)
+
+/* x86_64 protocol version */
+struct blkif_x86_64_request {   /* same fields as the x86_32 request, but id is 8-byte aligned */
+       uint8_t        operation;    /* BLKIF_OP_???                         */
+       uint8_t        nr_segments;  /* number of segments                   */
+       blkif_vdev_t   handle;       /* only for read/write requests         */
+       uint64_t       __attribute__((__aligned__(8))) id; /* request id, forced to 8-byte alignment */
+       blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+       struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* first nr_segments entries are copied by blkif_get_x86_64_req() */
+};
+struct blkif_x86_64_response {  /* same fields as the x86_32 response, but id is 8-byte aligned */
+       uint64_t       __attribute__((__aligned__(8))) id; /* request id, forced to 8-byte alignment */
+       uint8_t         operation;       /* copied from request */
+       int16_t         status;          /* BLKIF_RSP_???       */
+};
+typedef struct blkif_x86_64_request blkif_x86_64_request_t;
+typedef struct blkif_x86_64_response blkif_x86_64_response_t;
+
+/*
+ * Generate the ring types (sring, front ring, back ring) for each request
+ * layout.  Each invocation must stay on a single '+' line: the mail
+ * archive had wrapped them onto un-prefixed continuation lines, which
+ * breaks the hunk.
+ */
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response);
+
+union blkif_back_rings {        /* one backend ring, viewed through whichever layout is in use */
+       blkif_back_ring_t        native;   /* used for BLKIF_PROTOCOL_NATIVE */
+       blkif_common_back_ring_t common;   /* layout-independent view built from the dummy common structs */
+        blkif_x86_32_back_ring_t x86_32;  /* used for BLKIF_PROTOCOL_X86_32 */
+        blkif_x86_64_back_ring_t x86_64;  /* used for BLKIF_PROTOCOL_X86_64 */
+};
+typedef union blkif_back_rings blkif_back_rings_t;
+
+enum blkif_protocol {           /* selects which member of union blkif_back_rings is used */
+       BLKIF_PROTOCOL_NATIVE = 1,   /* requests are copied with a plain memcpy */
+       BLKIF_PROTOCOL_X86_32 = 2,   /* requests are converted by blkif_get_x86_32_req() */
+       BLKIF_PROTOCOL_X86_64 = 3,   /* requests are converted by blkif_get_x86_64_req() */
+};
+
+/*
+ * Copy a request from the x86_32-layout ring slot @src into the
+ * native-layout private copy @dst.
+ *
+ * @src is a slot of the ring shared with the frontend, so its contents may
+ * change while we read them.  The segment count is therefore fetched
+ * exactly once (into dst->nr_segments) and the clamp below works only on
+ * that private copy.  Reading src->nr_segments again between the bounds
+ * check and its use would let a racing frontend drive the copy loop past
+ * the end of dst->seg[] (double fetch, cf. XSA-155).
+ *
+ * dst->nr_segments keeps the unclamped value so that the consumer can
+ * still detect and reject an oversized request.
+ */
+static inline void blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src)
+{
+       int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+       dst->operation = src->operation;
+       dst->nr_segments = src->nr_segments;
+       dst->handle = src->handle;
+       dst->id = src->id;
+       dst->sector_number = src->sector_number;
+       /* Compiler barrier: do not re-read src->nr_segments below. */
+       __asm__ __volatile__("" ::: "memory");
+       if (n > dst->nr_segments)
+               n = dst->nr_segments;
+       for (i = 0; i < n; i++)
+               dst->seg[i] = src->seg[i];
+}
+
+/*
+ * Copy a request from the x86_64-layout ring slot @src into the
+ * native-layout private copy @dst.
+ *
+ * @src is a slot of the ring shared with the frontend, so its contents may
+ * change while we read them.  The segment count is therefore fetched
+ * exactly once (into dst->nr_segments) and the clamp below works only on
+ * that private copy.  Reading src->nr_segments again between the bounds
+ * check and its use would let a racing frontend drive the copy loop past
+ * the end of dst->seg[] (double fetch, cf. XSA-155).
+ *
+ * dst->nr_segments keeps the unclamped value so that the consumer can
+ * still detect and reject an oversized request.
+ */
+static inline void blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src)
+{
+       int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+       dst->operation = src->operation;
+       dst->nr_segments = src->nr_segments;
+       dst->handle = src->handle;
+       dst->id = src->id;
+       dst->sector_number = src->sector_number;
+       /* Compiler barrier: do not re-read src->nr_segments below. */
+       __asm__ __volatile__("" ::: "memory");
+       if (n > dst->nr_segments)
+               n = dst->nr_segments;
+       for (i = 0; i < n; i++)
+               dst->seg[i] = src->seg[i];
+}
+
+#endif /* __XEN_BLKIF_H__ */
diff --git a/hw/xen_disk.c b/hw/xen_disk.c
new file mode 100644
index 0000000..38b5fbf
--- /dev/null
+++ b/hw/xen_disk.c
@@ -0,0 +1,783 @@
+/*
+ *  xen paravirt block device backend
+ *
+ *  (c) Gerd Hoffmann <kraxel@xxxxxxxxxx>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; under version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <time.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/uio.h>
+
+#include <xs.h>
+#include <xenctrl.h>
+#include <xen/io/xenbus.h>
+
+#include "hw.h"
+#include "block_int.h"
+#include "qemu-char.h"
+#include "xen_blkif.h"
+#include "xen_backend.h"
+#include "sysemu.h"
+
+/* ------------------------------------------------------------- */
+
+static int syncwrite    = 1;   /* nonzero: ioreq_parse() requests a flush after every write */
+static int batch_maps   = 0;   /* nonzero: map/unmap all grant refs of a request in one call; set in blk_alloc() */
+
+static int max_requests = 32;  /* upper bound on ioreq structs allocated per device (see ioreq_start) */
+static int use_aio      = 0;   /* nonzero: blk_handle_requests() uses the aio path instead of the sync one */
+
+/* ------------------------------------------------------------- */
+
+#define BLOCK_SIZE  512        /* sector size in bytes; also published as "sector-size" by blk_init() */
+#define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)  /* NOTE(review): no user visible in this file */
+
+struct ioreq {                  /* one block request on its way through the backend */
+    blkif_request_t     req;    /* private copy of the ring request (see blk_get_request) */
+    int16_t             status; /* BLKIF_RSP_??? value placed in the response */
+
+    /* parsed request */
+    off_t               start;  /* byte offset: sector_number * file_blk */
+    QEMUIOVector        v;      /* one entry per segment; bases are in-page offsets until ioreq_map() */
+    int                 presync;  /* nonzero: bdrv_flush() before the I/O */
+    int                 postsync; /* nonzero: bdrv_flush() after the I/O */
+
+    /* grant mapping */
+    uint32_t            domids[BLKIF_MAX_SEGMENTS_PER_REQUEST]; /* frontend domain id, per segment */
+    uint32_t            refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];   /* grant reference, per segment */
+    int                 prot;   /* PROT_WRITE for reads (to memory), PROT_READ for writes */
+    void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];  /* per-segment mappings (batch_maps == 0) */
+    void                *pages; /* single contiguous mapping (batch_maps != 0) */
+
+    /* aio status */
+    int                 aio_inflight; /* outstanding completions; request finishes when it drops to 0 */
+    int                 aio_errors;   /* number of completions that reported an error */
+
+    struct XenBlkDev    *blkdev; /* owning device */
+    LIST_ENTRY(ioreq)   list;    /* link in exactly one of inflight/finished/freelist */
+};
+
+struct XenBlkDev {              /* per-device backend state */
+    struct XenDevice    xendev;  /* must be first */
+    char                *params; /* xenstore "params"; split in place at the first ':' by blk_init() */
+    char                *mode;   /* xenstore "mode"; "w" means read-write */
+    char                *type;   /* xenstore "type" */
+    char                *dev;    /* xenstore "dev" */
+    char                *devtype; /* xenstore "device-type"; "cdrom" sets VDISK_CDROM */
+    const char          *fileproto; /* points into params, or at a string literal; not freed separately */
+    const char          *filename;  /* points into params; not freed separately */
+    int                 ring_ref; /* frontend "ring-ref" grant reference */
+    void                *sring;   /* mapped shared ring page, NULL when not connected */
+    int64_t             file_blk; /* sector size in bytes (BLOCK_SIZE) */
+    int64_t             file_size; /* image length in bytes, 0 if it could not be determined */
+    int                 protocol; /* enum blkif_protocol */
+    blkif_back_rings_t  rings;
+    int                 more_work; /* nonzero: ring still holds requests to process */
+    int                 cnt_map;  /* number of grant pages currently mapped */
+
+    /* request lists */
+    LIST_HEAD(inflight_head, ioreq) inflight;
+    LIST_HEAD(finished_head, ioreq) finished;
+    LIST_HEAD(freelist_head, ioreq) freelist;
+    int                 requests_total;    /* ioreq structs allocated so far */
+    int                 requests_inflight; /* length of the inflight list */
+    int                 requests_finished; /* length of the finished list */
+
+    /* qemu block driver */
+    int                 index;  /* drive index, or -1 when bs was created by blk_init() */
+    BlockDriverState    *bs;
+    QEMUBH              *bh;    /* bottom half running blk_handle_requests() */
+};
+
+/* ------------------------------------------------------------- */
+
+/*
+ * Obtain an ioreq for @blkdev and put it on the inflight list.
+ *
+ * A request is recycled from the freelist when one is available;
+ * otherwise a new one is allocated, unless max_requests structs already
+ * exist for this device.  Returns NULL in that case.
+ */
+static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
+{
+    struct ioreq *req;
+
+    if (!LIST_EMPTY(&blkdev->freelist)) {
+        /* recycle a previously released request */
+        req = LIST_FIRST(&blkdev->freelist);
+        LIST_REMOVE(req, list);
+        qemu_iovec_reset(&req->v);
+    } else {
+        if (blkdev->requests_total >= max_requests)
+            return NULL;
+        /* nothing to recycle: allocate a fresh struct */
+        req = qemu_mallocz(sizeof(*req));
+        req->blkdev = blkdev;
+        blkdev->requests_total++;
+        qemu_iovec_init(&req->v, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+    }
+
+    LIST_INSERT_HEAD(&blkdev->inflight, req, list);
+    blkdev->requests_inflight++;
+    return req;
+}
+
+/* Move @ioreq from the inflight list to the finished list of its device. */
+static void ioreq_finish(struct ioreq *ioreq)
+{
+    struct XenBlkDev *dev = ioreq->blkdev;
+
+    /* bookkeeping first, then the list move */
+    dev->requests_inflight--;
+    dev->requests_finished++;
+
+    LIST_REMOVE(ioreq, list);
+    LIST_INSERT_HEAD(&dev->finished, ioreq, list);
+}
+
+/*
+ * Take @ioreq off the finished list, wipe its per-request state and park
+ * it on the freelist for reuse.
+ *
+ * The iovec embedded in the request owns a heap array allocated by
+ * qemu_iovec_init() in ioreq_start().  Clearing the whole struct would
+ * drop that pointer and leak the array on every recycle, so the iovec is
+ * carried across the memset and merely emptied.
+ */
+static void ioreq_release(struct ioreq *ioreq)
+{
+    struct XenBlkDev *blkdev = ioreq->blkdev;
+    QEMUIOVector v = ioreq->v;
+
+    LIST_REMOVE(ioreq, list);
+    memset(ioreq, 0, sizeof(*ioreq));
+    ioreq->v = v;
+    qemu_iovec_reset(&ioreq->v);
+    ioreq->blkdev = blkdev;
+    LIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
+    blkdev->requests_finished--;
+}
+
+/*
+ * Translate the guest request held in ioreq->req into a start offset
+ * (ioreq->start) plus an iovec (ioreq->v), doing sanity checks along
+ * the way.
+ *
+ * The iovec bases produced here are byte offsets within a page, not
+ * pointers; ioreq_map() later adds the mapped page address to each.
+ *
+ * Returns 0 on success.  On any validation failure ioreq->status is set
+ * to BLKIF_RSP_ERROR and -1 is returned.
+ */
+static int ioreq_parse(struct ioreq *ioreq)
+{
+    struct XenBlkDev *blkdev = ioreq->blkdev;
+    uintptr_t mem;
+    size_t len;
+    int i;
+
+    xen_be_printf(&blkdev->xendev, 3,
+                  "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
+                  ioreq->req.operation, ioreq->req.nr_segments,
+                  ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
+    switch (ioreq->req.operation) {
+    case BLKIF_OP_READ:
+        ioreq->prot = PROT_WRITE; /* to memory */
+        break;
+    case BLKIF_OP_WRITE_BARRIER:
+        if (!syncwrite)
+            ioreq->presync = ioreq->postsync = 1;
+        /* fall through */
+    case BLKIF_OP_WRITE:
+        ioreq->prot = PROT_READ; /* from memory */
+        if (syncwrite)
+            ioreq->postsync = 1;
+        break;
+    default:
+        xen_be_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
+                      ioreq->req.operation);
+        goto err;
+    }
+
+    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
+        xen_be_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
+        goto err;
+    }
+
+    /*
+     * The sector number is chosen by the guest.  Check it against the
+     * device size before multiplying: the product could otherwise wrap
+     * or turn negative and slip past the end-of-file check below.
+     */
+    if (ioreq->req.sector_number >
+        (uint64_t)(blkdev->file_size / blkdev->file_blk)) {
+        xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
+        goto err;
+    }
+
+    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
+    for (i = 0; i < ioreq->req.nr_segments; i++) {
+        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+            xen_be_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
+            goto err;
+        }
+        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
+            xen_be_printf(&blkdev->xendev, 0, "error: first > last sector\n");
+            goto err;
+        }
+        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
+            xen_be_printf(&blkdev->xendev, 0, "error: page crossing\n");
+            goto err;
+        }
+
+        ioreq->domids[i] = blkdev->xendev.dom;
+        ioreq->refs[i]   = ioreq->req.seg[i].gref;
+
+        /* offset and length of this segment inside its granted page */
+        mem = ioreq->req.seg[i].first_sect * blkdev->file_blk;
+        len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
+        qemu_iovec_add(&ioreq->v, (void*)mem, len);
+    }
+    if (ioreq->start + ioreq->v.size > blkdev->file_size) {
+        xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
+        goto err;
+    }
+    return 0;
+
+err:
+    ioreq->status = BLKIF_RSP_ERROR;
+    return -1;
+}
+
+/*
+ * Undo ioreq_map(): release the grant mappings held by @ioreq and adjust
+ * the device's cnt_map accordingly.  A failing munmap is logged and
+ * otherwise ignored.  Safe to call when nothing is mapped.
+ */
+static void ioreq_unmap(struct ioreq *ioreq)
+{
+    struct XenBlkDev *dev = ioreq->blkdev;
+    int gnt = dev->xendev.gnttabdev;
+    int idx;
+
+    if (ioreq->v.niov == 0)
+        return;
+
+    if (!batch_maps) {
+        /* one mapping per segment */
+        for (idx = 0; idx < ioreq->v.niov; idx++) {
+            if (ioreq->page[idx] == NULL)
+                continue;
+            if (xc_gnttab_munmap(xen_xc, gnt, ioreq->page[idx], 1) != 0)
+                xen_be_printf(&dev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
+                              strerror(errno));
+            dev->cnt_map--;
+            ioreq->page[idx] = NULL;
+        }
+        return;
+    }
+
+    /* one contiguous mapping for the whole request */
+    if (ioreq->pages == NULL)
+        return;
+    if (xc_gnttab_munmap(xen_xc, gnt, ioreq->pages, ioreq->v.niov) != 0)
+        xen_be_printf(&dev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
+                      strerror(errno));
+    dev->cnt_map -= ioreq->v.niov;
+    ioreq->pages = NULL;
+}
+
+/*
+ * Map the grant references collected by ioreq_parse() and turn every
+ * iovec base from an in-page offset into a real pointer.
+ *
+ * Returns 0 on success (also when the request has no segments) and -1
+ * when a mapping fails.  In the per-segment case the segments mapped so
+ * far are unmapped again before returning.
+ */
+static int ioreq_map(struct ioreq *ioreq)
+{
+    struct XenBlkDev *dev = ioreq->blkdev;
+    int gnt = dev->xendev.gnttabdev;
+    int idx;
+
+    if (ioreq->v.niov == 0)
+        return 0;
+
+    if (!batch_maps) {
+        /* one mapping per segment */
+        for (idx = 0; idx < ioreq->v.niov; idx++) {
+            ioreq->page[idx] = xc_gnttab_map_grant_ref
+                (xen_xc, gnt, ioreq->domids[idx], ioreq->refs[idx], ioreq->prot);
+            if (ioreq->page[idx] == NULL) {
+                xen_be_printf(&dev->xendev, 0,
+                              "can't map grant ref %d (%s, %d maps)\n",
+                              ioreq->refs[idx], strerror(errno), dev->cnt_map);
+                ioreq_unmap(ioreq);
+                return -1;
+            }
+            ioreq->v.iov[idx].iov_base = ioreq->page[idx] + (uintptr_t)ioreq->v.iov[idx].iov_base;
+            dev->cnt_map++;
+        }
+        return 0;
+    }
+
+    /* all segments in one contiguous mapping */
+    ioreq->pages = xc_gnttab_map_grant_refs
+        (xen_xc, gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
+    if (ioreq->pages == NULL) {
+        xen_be_printf(&dev->xendev, 0,
+                      "can't map %d grant refs (%s, %d maps)\n",
+                      ioreq->v.niov, strerror(errno), dev->cnt_map);
+        return -1;
+    }
+    for (idx = 0; idx < ioreq->v.niov; idx++)
+        ioreq->v.iov[idx].iov_base = ioreq->pages + idx * XC_PAGE_SIZE +
+            (uintptr_t)ioreq->v.iov[idx].iov_base;
+    dev->cnt_map += ioreq->v.niov;
+    return 0;
+}
+
+/*
+ * Carry out @ioreq synchronously through the qemu block layer, one iovec
+ * entry at a time.
+ *
+ * Whatever the outcome, the request ends up unmapped and on the finished
+ * list with ioreq->status set, so that the caller's
+ * blk_send_response_all() answers the frontend.  Without that, a failed
+ * request would stay on the inflight list with its grants mapped and the
+ * frontend would never see a response.
+ *
+ * Returns 0 on success, -1 on mapping or I/O failure.
+ */
+static int ioreq_runio_qemu_sync(struct ioreq *ioreq)
+{
+    struct XenBlkDev *blkdev = ioreq->blkdev;
+    int i, rc;
+    off_t pos;
+
+    if (ioreq_map(ioreq) == -1)
+        goto err;
+    if (ioreq->presync)
+        bdrv_flush(blkdev->bs);
+
+    switch (ioreq->req.operation) {
+    case BLKIF_OP_READ:
+        pos = ioreq->start;
+        for (i = 0; i < ioreq->v.niov; i++) {
+            rc = bdrv_read(blkdev->bs, pos / BLOCK_SIZE,
+                           ioreq->v.iov[i].iov_base,
+                           ioreq->v.iov[i].iov_len / BLOCK_SIZE);
+            if (rc != 0) {
+                xen_be_printf(&blkdev->xendev, 0, "rd I/O error (%p, len %zd)\n",
+                              ioreq->v.iov[i].iov_base,
+                              ioreq->v.iov[i].iov_len);
+                goto err;
+            }
+            pos += ioreq->v.iov[i].iov_len;
+        }
+        break;
+    case BLKIF_OP_WRITE:
+    case BLKIF_OP_WRITE_BARRIER:
+        pos = ioreq->start;
+        for (i = 0; i < ioreq->v.niov; i++) {
+            rc = bdrv_write(blkdev->bs, pos / BLOCK_SIZE,
+                            ioreq->v.iov[i].iov_base,
+                            ioreq->v.iov[i].iov_len / BLOCK_SIZE);
+            if (rc != 0) {
+                xen_be_printf(&blkdev->xendev, 0, "wr I/O error (%p, len %zd)\n",
+                              ioreq->v.iov[i].iov_base,
+                              ioreq->v.iov[i].iov_len);
+                goto err;
+            }
+            pos += ioreq->v.iov[i].iov_len;
+        }
+        break;
+    default:
+        /* unknown operation (shouldn't happen -- parse catches this) */
+        goto err;
+    }
+
+    if (ioreq->postsync)
+        bdrv_flush(blkdev->bs);
+    ioreq->status = BLKIF_RSP_OKAY;
+
+    ioreq_unmap(ioreq);
+    ioreq_finish(ioreq);
+    return 0;
+
+err:
+    ioreq->status = BLKIF_RSP_ERROR;
+    /* ioreq_unmap() copes with a request that was never (fully) mapped */
+    ioreq_unmap(ioreq);
+    ioreq_finish(ioreq);
+    return -1;
+}
+
+/*
+ * Completion callback for the aio path; @opaque is the struct ioreq.
+ *
+ * Counts errors, drops one in-flight reference and, once the last
+ * reference is gone, sets the final status, unmaps the grants, moves the
+ * request to the finished list and schedules the bottom half.
+ */
+static void qemu_aio_complete(void *opaque, int ret)
+{
+    struct ioreq *req = opaque;
+    struct XenBlkDev *dev = req->blkdev;
+
+    if (ret != 0) {
+        const char *what = (req->req.operation == BLKIF_OP_READ) ? "read" : "write";
+
+        xen_be_printf(&dev->xendev, 0, "%s I/O error\n", what);
+        req->aio_errors++;
+    }
+
+    /* other completions for this request are still pending */
+    if (--req->aio_inflight > 0)
+        return;
+
+    if (req->aio_errors)
+        req->status = BLKIF_RSP_ERROR;
+    else
+        req->status = BLKIF_RSP_OKAY;
+    ioreq_unmap(req);
+    ioreq_finish(req);
+    qemu_bh_schedule(dev->bh);
+}
+
+/*
+ * Submit @ioreq to the qemu block layer asynchronously.
+ *
+ * The function holds one aio_inflight reference of its own while it
+ * submits, and drops it through qemu_aio_complete() at the end.  The
+ * request therefore cannot finish before submission is done.
+ *
+ * On failure (grant mapping failed, or an operation the parser should
+ * have rejected) the request is unmapped, moved to the finished list with
+ * BLKIF_RSP_ERROR and the bottom half is scheduled so that the error
+ * response is sent.  Without that, the request would stay on the inflight
+ * list forever and the frontend would never get an answer.
+ *
+ * Returns 0 when the request was submitted, -1 on failure.
+ */
+static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
+{
+    struct XenBlkDev *blkdev = ioreq->blkdev;
+
+    if (ioreq_map(ioreq) == -1)
+        goto err;
+
+    ioreq->aio_inflight++;
+    if (ioreq->presync)
+        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */
+
+    switch (ioreq->req.operation) {
+    case BLKIF_OP_READ:
+        ioreq->aio_inflight++;
+        bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
+                       &ioreq->v, ioreq->v.size / BLOCK_SIZE,
+                       qemu_aio_complete, ioreq);
+        break;
+    case BLKIF_OP_WRITE:
+    case BLKIF_OP_WRITE_BARRIER:
+        ioreq->aio_inflight++;
+        bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
+                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
+                        qemu_aio_complete, ioreq);
+        break;
+    default:
+        /* unknown operation (shouldn't happen -- parse catches this) */
+        goto err;
+    }
+
+    if (ioreq->postsync)
+        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */
+    qemu_aio_complete(ioreq, 0);
+
+    return 0;
+
+err:
+    ioreq->status = BLKIF_RSP_ERROR;
+    /* ioreq_unmap() copes with a request that was never mapped */
+    ioreq_unmap(ioreq);
+    ioreq_finish(ioreq);
+    qemu_bh_schedule(blkdev->bh);
+    return -1;
+}
+
+/*
+ * Put the response for @ioreq on the ring and publish it.
+ *
+ * Also re-checks the ring for further requests and bumps
+ * blkdev->more_work when there are any.
+ *
+ * Returns nonzero when the frontend has to be notified.  Returns 0
+ * without touching the ring when blkdev->protocol is not one of the
+ * known values; the previous code would have done a memcpy() to a NULL
+ * destination in that case.
+ */
+static int blk_send_response_one(struct ioreq *ioreq)
+{
+    struct XenBlkDev  *blkdev = ioreq->blkdev;
+    int               send_notify   = 0;
+    int               have_requests = 0;
+    blkif_response_t  resp;
+    void              *dst;
+
+    resp.id        = ioreq->req.id;
+    resp.operation = ioreq->req.operation;
+    resp.status    = ioreq->status;
+
+    /* Place on the response ring for the relevant domain. */
+    switch (blkdev->protocol) {
+    case BLKIF_PROTOCOL_NATIVE:
+        dst = RING_GET_RESPONSE(&blkdev->rings.native, blkdev->rings.native.rsp_prod_pvt);
+        break;
+    case BLKIF_PROTOCOL_X86_32:
+        dst = RING_GET_RESPONSE(&blkdev->rings.x86_32, blkdev->rings.x86_32.rsp_prod_pvt);
+        break;
+    case BLKIF_PROTOCOL_X86_64:
+        dst = RING_GET_RESPONSE(&blkdev->rings.x86_64, blkdev->rings.x86_64.rsp_prod_pvt);
+        break;
+    default:
+        /* no ring layout to write to */
+        return 0;
+    }
+    memcpy(dst, &resp, sizeof(resp));
+    blkdev->rings.common.rsp_prod_pvt++;
+
+    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
+    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
+        /*
+         * Tail check for pending requests. Allows frontend to avoid
+         * notifications if requests are already in flight (lower
+         * overheads and promotes batching).
+         */
+        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
+    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
+        have_requests = 1;
+    }
+
+    if (have_requests)
+        blkdev->more_work++;
+    return send_notify;
+}
+
+/*
+ * Drain the finished list: send the response for every request on it,
+ * recycle the request, and notify the frontend once at the end if any of
+ * the responses asked for a notification.
+ */
+static void blk_send_response_all(struct XenBlkDev *blkdev)
+{
+    int need_notify = 0;
+
+    for (;;) {
+        struct ioreq *done;
+
+        if (LIST_EMPTY(&blkdev->finished))
+            break;
+        done = LIST_FIRST(&blkdev->finished);
+        if (blk_send_response_one(done))
+            need_notify = 1;
+        ioreq_release(done);
+    }
+
+    if (need_notify)
+        xen_be_send_notify(&blkdev->xendev);
+}
+
+/*
+ * Copy the request at ring index @rc into ioreq->req, converting from
+ * the frontend's layout where necessary.  Always returns 0; an unknown
+ * protocol value leaves ioreq->req untouched.
+ */
+static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq, RING_IDX rc)
+{
+    blkif_request_t *out = &ioreq->req;
+
+    switch (blkdev->protocol) {
+    case BLKIF_PROTOCOL_X86_64:
+        blkif_get_x86_64_req(out, RING_GET_REQUEST(&blkdev->rings.x86_64, rc));
+        break;
+    case BLKIF_PROTOCOL_X86_32:
+        blkif_get_x86_32_req(out, RING_GET_REQUEST(&blkdev->rings.x86_32, rc));
+        break;
+    case BLKIF_PROTOCOL_NATIVE:
+        /* same layout on both sides: plain copy */
+        memcpy(out, RING_GET_REQUEST(&blkdev->rings.native, rc),
+               sizeof(ioreq->req));
+        break;
+    }
+    return 0;
+}
+
+/*
+ * Main request loop, run from the bottom half.
+ *
+ * Pulls requests off the ring up to the producer index sampled at entry,
+ * parses each one and runs it in sync or aio mode.  In aio mode the
+ * responses of previously completed requests are sent before the loop;
+ * in sync mode the responses are sent after it.  The bottom half is
+ * rescheduled when more work is known to be pending and request slots
+ * are available.
+ */
+static void blk_handle_requests(struct XenBlkDev *blkdev)
+{
+    RING_IDX rc, rp;
+    struct ioreq *ioreq;
+
+    blkdev->more_work = 0;
+
+    rc = blkdev->rings.common.req_cons;
+    rp = blkdev->rings.common.sring->req_prod;
+    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+    if (use_aio)
+        blk_send_response_all(blkdev);
+    while ((rc != rp)) {
+        /* pull request from ring */
+        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc))
+            break;
+        ioreq = ioreq_start(blkdev);
+        if (ioreq == NULL) {
+            blkdev->more_work++;
+            break;
+        }
+        blk_get_request(blkdev, ioreq, rc);
+        blkdev->rings.common.req_cons = ++rc;
+
+        /* parse them */
+        if (ioreq_parse(ioreq) != 0) {
+            /*
+             * ioreq_release() decrements requests_finished, so the
+             * request has to go through the finished list first.
+             * Releasing it straight from the inflight list would
+             * leave requests_inflight too high and drive
+             * requests_finished negative.
+             */
+            ioreq_finish(ioreq);
+            if (blk_send_response_one(ioreq))
+                xen_be_send_notify(&blkdev->xendev);
+            ioreq_release(ioreq);
+            continue;
+        }
+
+        if (use_aio) {
+            /* run i/o in aio mode */
+            ioreq_runio_qemu_aio(ioreq);
+        } else {
+            /* run i/o in sync mode */
+            ioreq_runio_qemu_sync(ioreq);
+        }
+    }
+    if (!use_aio)
+        blk_send_response_all(blkdev);
+
+    if (blkdev->more_work && blkdev->requests_inflight < max_requests)
+        qemu_bh_schedule(blkdev->bh);
+}
+
+/* ------------------------------------------------------------- */
+
+/* Bottom-half entry point; @opaque is the struct XenBlkDev to service. */
+static void blk_bh(void *opaque)
+{
+    blk_handle_requests((struct XenBlkDev *)opaque);
+}
+
+/*
+ * Allocation hook: set up the empty request lists and the bottom half
+ * of a new device.  Also turns on batched grant mapping for all devices
+ * unless qemu runs in XEN_EMULATE mode.
+ */
+static void blk_alloc(struct XenDevice *xendev)
+{
+    struct XenBlkDev *dev = container_of(xendev, struct XenBlkDev, xendev);
+
+    LIST_INIT(&dev->inflight);
+    LIST_INIT(&dev->finished);
+    LIST_INIT(&dev->freelist);
+
+    dev->bh = qemu_bh_new(blk_bh, dev);
+
+    if (xen_mode == XEN_EMULATE)
+        return;
+    batch_maps = 1;
+}
+
+/*
+ * Init hook: read the backend's xenstore configuration, open (or look
+ * up) the qemu block device and publish its properties to the frontend.
+ *
+ * "params" has the form "<fileproto>:<filename>" or just "<filename>";
+ * it is split in place, so fileproto and filename point into params.
+ * A fileproto of "aio" is treated as "raw".
+ *
+ * Returns 0 on success and -1 when a required xenstore entry is missing
+ * or the block device cannot be opened.  Entries already read stay
+ * cached in blkdev, so a later call only fetches what is still missing.
+ */
+static int blk_init(struct XenDevice *xendev)
+{
+    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
+    int qflags, have_barriers, info = 0;
+    char *h;
+
+    /* read xenstore entries */
+    if (blkdev->params == NULL) {
+        blkdev->params = xenstore_read_be_str(&blkdev->xendev, "params");
+        /* the entry may be missing: do not hand NULL to strchr() */
+        h = blkdev->params ? strchr(blkdev->params, ':') : NULL;
+        if (h != NULL) {
+            blkdev->fileproto = blkdev->params;
+            blkdev->filename  = h+1;
+            *h = 0;
+        } else {
+            blkdev->fileproto = "<unset>";
+            blkdev->filename  = blkdev->params;
+        }
+    }
+    if (!strcmp("aio", blkdev->fileproto))
+        blkdev->fileproto = "raw";
+    if (blkdev->mode == NULL)
+        blkdev->mode = xenstore_read_be_str(&blkdev->xendev, "mode");
+    if (blkdev->type == NULL)
+        blkdev->type = xenstore_read_be_str(&blkdev->xendev, "type");
+    if (blkdev->dev == NULL)
+        blkdev->dev = xenstore_read_be_str(&blkdev->xendev, "dev");
+    if (blkdev->devtype == NULL)
+        blkdev->devtype = xenstore_read_be_str(&blkdev->xendev, "device-type");
+
+    /* do we have all we need? */
+    if (blkdev->params == NULL ||
+        blkdev->mode == NULL   ||
+        blkdev->type == NULL   ||
+        blkdev->dev == NULL)
+        return -1;
+
+    /* read-only ? */
+    if (strcmp(blkdev->mode, "w") == 0) {
+        qflags = BDRV_O_RDWR;
+    } else {
+        qflags = BDRV_O_RDONLY;
+        info  |= VDISK_READONLY;
+    }
+
+    /* cdrom ? */
+    if (blkdev->devtype && !strcmp(blkdev->devtype, "cdrom"))
+        info  |= VDISK_CDROM;
+
+    /* init qemu block driver */
+    blkdev->index = (blkdev->xendev.dev - 202 * 256) / 16;
+    blkdev->index = drive_get_index(IF_XEN, 0, blkdev->index);
+    if (blkdev->index == -1) {
+        /* setup via xenbus -> create new block driver instance */
+        xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
+        blkdev->bs = bdrv_new(blkdev->dev);
+        if (blkdev->bs) {
+            if (bdrv_open2(blkdev->bs, blkdev->filename, qflags,
+                           bdrv_find_format(blkdev->fileproto)) != 0) {
+                bdrv_delete(blkdev->bs);
+                blkdev->bs = NULL;
+            }
+        }
+        if (!blkdev->bs)
+            return -1;
+    } else {
+        /* setup via qemu cmdline -> already setup for us */
+        xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n");
+        blkdev->bs = drives_table[blkdev->index].bdrv;
+    }
+    blkdev->file_blk  = BLOCK_SIZE;
+    blkdev->file_size = bdrv_getlength(blkdev->bs);
+    if (blkdev->file_size < 0) {
+        xen_be_printf(&blkdev->xendev, 1, "bdrv_getlength: %d (%s) | drv %s\n",
+                      (int)blkdev->file_size, strerror(-blkdev->file_size),
+                      blkdev->bs->drv ? blkdev->bs->drv->format_name : "-");
+        /* unknown length is published as an empty device */
+        blkdev->file_size = 0;
+    }
+    have_barriers = blkdev->bs->drv && blkdev->bs->drv->bdrv_flush ? 1 : 0;
+
+    xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
+                  " size %" PRId64 " (%" PRId64 " MB)\n",
+                  blkdev->type, blkdev->fileproto, blkdev->filename,
+                  blkdev->file_size, blkdev->file_size >> 20);
+
+    /* fill info */
+    xenstore_write_be_int(&blkdev->xendev, "feature-barrier", have_barriers);
+    xenstore_write_be_int(&blkdev->xendev, "info",            info);
+    xenstore_write_be_int(&blkdev->xendev, "sector-size",     blkdev->file_blk);
+    xenstore_write_be_int(&blkdev->xendev, "sectors",
+                          blkdev->file_size / blkdev->file_blk);
+    return 0;
+}
+
+/*
+ * Connect hook: read the frontend's ring reference and event channel,
+ * pick the ring layout from the negotiated protocol string, map the
+ * shared ring page, initialise the matching back ring and bind the event
+ * channel.
+ *
+ * Returns 0 on success and -1 when a frontend entry is missing or the
+ * ring page cannot be mapped.
+ */
+static int blk_connect(struct XenDevice *xendev)
+{
+    struct XenBlkDev *dev = container_of(xendev, struct XenBlkDev, xendev);
+    const char *proto = dev->xendev.protocol;
+
+    if (xenstore_read_fe_int(&dev->xendev, "ring-ref", &dev->ring_ref) == -1)
+        return -1;
+    if (xenstore_read_fe_int(&dev->xendev, "event-channel",
+                             &dev->xendev.remote_port) == -1)
+        return -1;
+
+    /* native layout unless the frontend named one of the known ABIs */
+    dev->protocol = BLKIF_PROTOCOL_NATIVE;
+    if (proto) {
+        if (strcmp(proto, XEN_IO_PROTO_ABI_X86_32) == 0)
+            dev->protocol = BLKIF_PROTOCOL_X86_32;
+        if (strcmp(proto, XEN_IO_PROTO_ABI_X86_64) == 0)
+            dev->protocol = BLKIF_PROTOCOL_X86_64;
+    }
+
+    dev->sring = xc_gnttab_map_grant_ref(xen_xc, dev->xendev.gnttabdev,
+                                         dev->xendev.dom,
+                                         dev->ring_ref,
+                                         PROT_READ | PROT_WRITE);
+    if (dev->sring == NULL)
+        return -1;
+    dev->cnt_map++;
+
+    switch (dev->protocol) {
+    case BLKIF_PROTOCOL_X86_64:
+    {
+        blkif_x86_64_sring_t *shared = dev->sring;
+        BACK_RING_INIT(&dev->rings.x86_64, shared, XC_PAGE_SIZE);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_32:
+    {
+        blkif_x86_32_sring_t *shared = dev->sring;
+        BACK_RING_INIT(&dev->rings.x86_32, shared, XC_PAGE_SIZE);
+        break;
+    }
+    case BLKIF_PROTOCOL_NATIVE:
+    {
+        blkif_sring_t *shared = dev->sring;
+        BACK_RING_INIT(&dev->rings.native, shared, XC_PAGE_SIZE);
+        break;
+    }
+    }
+
+    xen_be_bind_evtchn(&dev->xendev);
+
+    xen_be_printf(&dev->xendev, 1, "ok: proto %s, ring-ref %d, "
+                  "remote port %d, local port %d\n",
+                  dev->xendev.protocol, dev->ring_ref,
+                  dev->xendev.remote_port, dev->xendev.local_port);
+    return 0;
+}
+
+/*
+ * Disconnect hook: drop the block device (closing it only when blk_init
+ * created it), unbind the event channel and unmap the shared ring page.
+ */
+static void blk_disconnect(struct XenDevice *xendev)
+{
+    struct XenBlkDev *dev = container_of(xendev, struct XenBlkDev, xendev);
+
+    if (dev->bs != NULL) {
+        /* close/delete only if we created it ourself */
+        if (dev->index == -1) {
+            bdrv_close(dev->bs);
+            bdrv_delete(dev->bs);
+        }
+        dev->bs = NULL;
+    }
+
+    xen_be_unbind_evtchn(&dev->xendev);
+
+    if (dev->sring == NULL)
+        return;
+    xc_gnttab_munmap(xen_xc, dev->xendev.gnttabdev, dev->sring, 1);
+    dev->cnt_map--;
+    dev->sring = NULL;
+}
+
+/*
+ * Free hook: destroy every request parked on the freelist, then release
+ * the cached xenstore strings and the bottom half.  Always returns 0.
+ */
+static int blk_free(struct XenDevice *xendev)
+{
+    struct XenBlkDev *dev = container_of(xendev, struct XenBlkDev, xendev);
+    struct ioreq *spare;
+
+    for (;;) {
+        if (LIST_EMPTY(&dev->freelist))
+            break;
+        spare = LIST_FIRST(&dev->freelist);
+        LIST_REMOVE(spare, list);
+        qemu_iovec_destroy(&spare->v);
+        qemu_free(spare);
+    }
+
+    qemu_free(dev->params);
+    qemu_free(dev->mode);
+    qemu_free(dev->type);
+    qemu_free(dev->dev);
+    qemu_free(dev->devtype);
+    qemu_bh_delete(dev->bh);
+    return 0;
+}
+
+/* Event hook: defer the ring processing to the device's bottom half. */
+static void blk_event(struct XenDevice *xendev)
+{
+    struct XenBlkDev *dev = container_of(xendev, struct XenBlkDev, xendev);
+
+    qemu_bh_schedule(dev->bh);
+}
+
+struct XenDevOps xen_blkdev_ops = {   /* backend ops registered under the name "qdisk" */
+    .size       = sizeof(struct XenBlkDev),   /* XenBlkDev embeds struct XenDevice as its first member */
+    .flags      = DEVOPS_FLAG_NEED_GNTDEV,
+    .alloc      = blk_alloc,      /* list and bottom-half setup */
+    .init       = blk_init,       /* xenstore config + block device open */
+    .initialise    = blk_connect, /* ring mapping + event channel bind */
+    .disconnect = blk_disconnect,
+    .event      = blk_event,      /* schedules the bottom half */
+    .free       = blk_free,
+};
diff --git a/hw/xen_machine_fv.c b/hw/xen_machine_fv.c
index 79880a8..7eb3792 100644
--- a/hw/xen_machine_fv.c
+++ b/hw/xen_machine_fv.c
@@ -368,6 +368,7 @@ static void xen_init_fv(ram_addr_t ram_size, int vga_ram_size,
         exit(1);
     }
     xen_be_register("console", &xen_console_ops);
+    xen_be_register("qdisk", &xen_blkdev_ops);
 
     pc_machine.init(ram_size, vga_ram_size, boot_device,
                    kernel_filename, kernel_cmdline, initrd_filename,
diff --git a/hw/xen_machine_pv.c b/hw/xen_machine_pv.c
index 34fe4d4..b2475ba 100644
--- a/hw/xen_machine_pv.c
+++ b/hw/xen_machine_pv.c
@@ -69,6 +69,7 @@ static void xen_init_pv(ram_addr_t ram_size, int vga_ram_size,
     xen_be_register("console", &xen_console_ops);
     xen_be_register("vkbd", &xen_kbdmouse_ops);
     xen_be_register("vfb", &xen_framebuffer_ops);
+    xen_be_register("qdisk", &xen_blkdev_ops);
 
     /* setup framebuffer */
     xen_init_display(xen_domid);
diff --git a/sysemu.h b/sysemu.h
index 87b278b..66b8ab2 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -127,7 +127,7 @@ extern unsigned int nb_prom_envs;
 
 typedef enum {
     IF_BLKTAP,
-    IF_IDE, IF_SCSI, IF_FLOPPY, IF_PFLASH, IF_MTD, IF_SD, IF_VIRTIO
+    IF_IDE, IF_SCSI, IF_FLOPPY, IF_PFLASH, IF_MTD, IF_SD, IF_VIRTIO, IF_XEN
 } BlockInterfaceType;
 
 typedef enum {
diff --git a/vl.c b/vl.c
index a90da8c..2fb5f82 100644
--- a/vl.c
+++ b/vl.c
@@ -2396,6 +2396,9 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque)
        } else if (!strcmp(buf, "mtd")) {
            type = IF_MTD;
             max_devs = 0;
+       } else if (!strcmp(buf, "xen")) {
+           type = IF_XEN;
+            max_devs = 0;
        } else if (!strcmp(buf, "sd")) {
            type = IF_SD;
             max_devs = 0;
@@ -2615,6 +2618,7 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque)
 
     switch(type) {
     case IF_IDE:
+    case IF_XEN:
     case IF_SCSI:
         switch(media) {
        case MEDIA_DISK:
diff --git a/xen-hooks.mak b/xen-hooks.mak
index 93f4402..891d1aa 100644
--- a/xen-hooks.mak
+++ b/xen-hooks.mak
@@ -30,6 +30,7 @@ OBJS += xen_machine_pv.o
 OBJS += xen_backend.o
 OBJS += xenfb.o
 OBJS += xen_console.o
+OBJS += xen_disk.o
 OBJS += xen_machine_fv.o
 OBJS += exec-dm.o
 OBJS += pci_emulation.o

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [PATCH 1/4] qemu-xen: backport xen_disk from upstream qemu