[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 1 of 3] blktap3: Merge from blktap2.5



This patch imports commits from blktap2.5 after blktap3 was forked from it in
July 2011 (4212b833df6321fac4ccabb75f7f9a476553d6d0) until February 2013
(ce9f9ce8529ac7fc330371c8ec1efe018da637e0). There are a few more to be merged.

The patch introduces (and builds) the following:
 * the part utility
 * the lvm utility
 * the tap-ctl utility
 * vhd-util-XXX utilities
 * the mirroring functionality (NBD)
 * the pause, unpause, and stats commands
 * various block drivers (mostly in unknown state)
 * bug fixes

Signed-off-by: Thanos Makatos <thanos.makatos@xxxxxxxxxx>

diff --git a/tools/blktap3/Makefile b/tools/blktap3/Makefile
--- a/tools/blktap3/Makefile
+++ b/tools/blktap3/Makefile
@@ -10,6 +10,8 @@ SUBDIRS-y += vhd
 SUBDIRS-y += control
 SUBDIRS-y += tapback
 SUBDIRS-y += drivers
+SUBDIRS-y += part
+SUBDIRS-y += lvm
 
 tags:
        ctags -R --language-force=C --c-kinds=+px
diff --git a/tools/blktap3/control/Makefile b/tools/blktap3/control/Makefile
--- a/tools/blktap3/control/Makefile
+++ b/tools/blktap3/control/Makefile
@@ -5,6 +5,8 @@ MAJOR              = 3
 MINOR              = 0
 LIBNAME            = libblktapctl
 
+IBIN               = tap3-ctl
+
 override CFLAGS += \
        -I../include \
        -DTAPDISK_BUILDDIR='"../drivers"' \
@@ -16,7 +18,7 @@ override CFLAGS += \
     -Wextra \
     -Werror
 
-# FIXME cause trouble
+# TODO cause trouble
 override CFLAGS += \
     -Wno-type-limits \
     -Wno-missing-field-initializers \
@@ -31,6 +33,9 @@ CTL_OBJS  += tap-ctl-open.o
 CTL_OBJS  += tap-ctl-close.o
 CTL_OBJS  += tap-ctl-create.o
 CTL_OBJS  += tap-ctl-destroy.o
+CTL_OBJS  += tap-ctl-pause.o
+CTL_OBJS  += tap-ctl-unpause.o
+CTL_OBJS  += tap-ctl-stats.o
 
 CTL_PICS  = $(patsubst %.o,%.opic,$(CTL_OBJS))
 
@@ -50,15 +55,18 @@ build: $(IBIN) $(LIB_STATIC) $(LIB_SHARE
        $(CC) $(LDFLAGS) -fPIC -Wl,$(SONAME_LDFLAG) -Wl,$(LIB_SHARED) \
                $(SHLIB_LDFLAGS) -rdynamic $^ -o $@
 
-install: $(LIB_STATIC) $(LIB_SHARED)
-       $(INSTALL_DIR) -p $(DESTDIR)$(SBINDIR)
-       # TODO Why install the static version?
-       #$(INSTALL_DATA) $(LIB_STATIC) $(DESTDIR)$(LIBDIR)
+tap3-ctl: tap-ctl.o $(LIB_SHARED)
+       $(CC) $(LDFLAGS) -o $@ $^
+
+install: $(LIB_SHARED) $(IBIN)
+       $(INSTALL_DIR) -p $(DESTDIR)$(BINDIR)
        $(INSTALL_PROG) $(LIB_SHARED) $(DESTDIR)$(LIBDIR)
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(BINDIR)
        ldconfig
 
 clean:
-       rm -f $(CTL_OBJS) $(PICS) $(DEPS) $(LIB_STATIC) $(LIB_SHARED)
+       rm -f $(CTL_OBJS) $(PICS) $(DEPS) $(LIB_STATIC) $(LIB_SHARED) $(IBIN)\
+        tap-ctl.o
 
 .PHONY: all build clean install
 
diff --git a/tools/blktap3/control/tap-ctl-destroy.c 
b/tools/blktap3/control/tap-ctl-destroy.c
--- a/tools/blktap3/control/tap-ctl-destroy.c
+++ b/tools/blktap3/control/tap-ctl-destroy.c
@@ -1,15 +1,29 @@
 /*
- * Copyright (C) 2012      Citrix Ltd.
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; version 2.1 only. with the special
- * exception on linking described in file LICENSE.
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <stdio.h>
diff --git a/tools/blktap3/control/tap-ctl-ipc.c 
b/tools/blktap3/control/tap-ctl-ipc.c
--- a/tools/blktap3/control/tap-ctl-ipc.c
+++ b/tools/blktap3/control/tap-ctl-ipc.c
@@ -154,7 +154,8 @@ tap_ctl_send_and_receive(const int sfd, 
                return err;
        }
 
-    if (tapdisk_message_is_rsp_paired(msg_type)) {
+    if (TAPDISK_MESSAGE_ERROR != message->type
+            && tapdisk_message_is_rsp_paired(msg_type)) {
                if (message->type - msg_type != 1) {
                        err = EINVAL;
                        EPRINTF("invalid response '%s' to message '%s'\n",
@@ -190,7 +191,7 @@ tap_ctl_connect(const char *name, int *s
 
        fd = socket(AF_UNIX, SOCK_STREAM, 0);
        if (fd == -1) {
-               EPRINTF("couldn't create socket for %s: %d\n", name, errno);
+               EPRINTF("couldn't create socket for %s: %s\n", name, 
strerror(errno));
                return -errno;
        }
 
@@ -200,9 +201,10 @@ tap_ctl_connect(const char *name, int *s
 
        err = connect(fd, (const struct sockaddr *)&saddr, sizeof(saddr));
        if (err) {
-               EPRINTF("couldn't connect to %s: %d\n", name, errno);
+        err = errno;
+               EPRINTF("couldn't connect to %s: %s\n", name, strerror(err));
                close(fd);
-               return -errno;
+               return -err;
        }
 
        *sfd = fd;
@@ -229,6 +231,7 @@ tap_ctl_connect_id(int id, int *sfd)
        }
 
        err = tap_ctl_connect(name, sfd);
+
        free(name);
 
        return err;
diff --git a/tools/blktap3/control/tap-ctl-open.c 
b/tools/blktap3/control/tap-ctl-open.c
--- a/tools/blktap3/control/tap-ctl-open.c
+++ b/tools/blktap3/control/tap-ctl-open.c
@@ -45,7 +45,7 @@ tap_ctl_open(const int pid, const char *
     memset(&message, 0, sizeof(message));
     message.type = TAPDISK_MESSAGE_OPEN;
     if (prt_path) {
-        if (strnlen(prt_path, TAPDISK_MESSAGE_OPEN) == TAPDISK_MESSAGE_OPEN)
+        if (strlen(prt_path) >= TAPDISK_MESSAGE_OPEN)
             return -ENAMETOOLONG;
         strcpy(message.u.params.prt_path, prt_path);
     }
diff --git a/tools/blktap3/control/tap-ctl-pause.c 
b/tools/blktap3/control/tap-ctl-pause.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/control/tap-ctl-pause.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <assert.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send a PAUSE request for the image described by @params to the tapdisk
+ * addressed by @id and report the outcome.
+ *
+ * @params must not be NULL (asserted). If it is not shorter than
+ * TAPDISK_MESSAGE_MAX_PATH_LENGTH, ENAMETOOLONG is returned without
+ * contacting the tapdisk. @timeout is passed through to
+ * tap_ctl_connect_send_and_receive() untouched.
+ *
+ * Returns whatever tap_ctl_connect_send_and_receive() reports on failure,
+ * the error field of a PAUSE_RSP or ERROR reply, or EINVAL if the reply is
+ * of any other type.
+ */
+int
+tap_ctl_pause(const int id, const char *params, struct timeval *timeout)
+{
+       tapdisk_message_t msg;
+       int rc;
+
+       assert(params);
+
+       if (strnlen(params, TAPDISK_MESSAGE_MAX_PATH_LENGTH)
+           >= TAPDISK_MESSAGE_MAX_PATH_LENGTH)
+               return ENAMETOOLONG;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.type = TAPDISK_MESSAGE_PAUSE;
+       strncpy(msg.u.params.path, params, TAPDISK_MESSAGE_MAX_PATH_LENGTH);
+
+       rc = tap_ctl_connect_send_and_receive(id, &msg, timeout);
+       if (rc)
+               return rc;
+
+       /* Anything but the paired response or an error report is bogus. */
+       if (msg.type != TAPDISK_MESSAGE_PAUSE_RSP
+           && msg.type != TAPDISK_MESSAGE_ERROR) {
+               EPRINTF("got unexpected message '%s' from %d\n",
+                       tapdisk_message_name(msg.type), id);
+               return EINVAL;
+       }
+
+       return msg.u.response.error;
+}
diff --git a/tools/blktap3/control/tap-ctl-stats.c 
b/tools/blktap3/control/tap-ctl-stats.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/control/tap-ctl-stats.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2010, Citrix 
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <assert.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Connect to the tapdisk with id @pid and send it a STATS request for the
+ * image described by @params.
+ *
+ * @params must not be NULL (asserted) and must be shorter than
+ * TAPDISK_MESSAGE_MAX_PATH_LENGTH. The write is given a 10 second timeout.
+ *
+ * Returns the connected socket (>= 0) on success; the caller owns it and
+ * must close() it. On failure a negative errno value is returned and no
+ * descriptor is left open. Callers tell the two apart with "< 0", so every
+ * error is forced negative here.
+ */
+int
+_tap_ctl_stats_connect_and_send(pid_t pid, const char *params)
+{
+       struct timeval timeout = { .tv_sec = 10, .tv_usec = 0 };
+       tapdisk_message_t message;
+       int sfd, err;
+
+       assert(params);
+
+       if (strnlen(params, TAPDISK_MESSAGE_MAX_PATH_LENGTH)
+           >= TAPDISK_MESSAGE_MAX_PATH_LENGTH)
+               return -ENAMETOOLONG;
+
+       memset(&message, 0, sizeof(message));
+       message.type = TAPDISK_MESSAGE_STATS;
+       strncpy(message.u.params.path, params, TAPDISK_MESSAGE_MAX_PATH_LENGTH);
+
+       err = tap_ctl_connect_id(pid, &sfd);
+       if (err)
+               return err < 0 ? err : -err;
+
+       err = tap_ctl_write_message(sfd, &message, &timeout);
+       if (err) {
+               /* Do not leak the socket when the request cannot be sent. */
+               close(sfd);
+               return err < 0 ? err : -err;
+       }
+
+       return sfd;
+}
+
+/*
+ * Request statistics for the image @params from tapdisk @pid and store the
+ * reply in @buf as a NUL-terminated string.
+ *
+ * At most @size - 1 payload bytes are stored; a longer reply is truncated.
+ * @buf must be non-NULL and @size non-zero, otherwise -EINVAL is returned
+ * before any connection is made.
+ *
+ * Returns 0 on success, or the error reported by the helper that failed.
+ * The socket is closed on every path once it has been opened.
+ *
+ * NOTE(review): the "< 0" test implies the sender may encode an error as a
+ * negative length; the field's declared type is not visible here - confirm.
+ */
+ssize_t
+tap_ctl_stats(pid_t pid, const char *params, char *buf, size_t size)
+{
+       tapdisk_message_t message;
+       ssize_t len;
+       int sfd, err;
+
+       assert(params);
+
+       /* Room for at least the terminating NUL is required. */
+       if (!buf || size == 0)
+               return -EINVAL;
+
+       sfd = _tap_ctl_stats_connect_and_send(pid, params);
+       if (sfd < 0)
+               return sfd;
+
+       err = tap_ctl_read_message(sfd, &message, NULL);
+       if (err)
+               goto out;
+
+       /* Signed on purpose: with size_t the error test could never fire. */
+       len = (ssize_t)message.u.info.length;
+       if (len < 0) {
+               err = (int)len;
+               goto out;
+       }
+       if ((size_t)len > size - 1)
+               len = size - 1;
+
+       err = tap_ctl_read_raw(sfd, buf, len, NULL);
+       if (err)
+               goto out;
+
+       buf[len] = '\0';
+
+out:
+       close(sfd);
+       return err;
+}
+
+/*
+ * Request statistics for the image @params from tapdisk @pid and copy the
+ * reply to @stream, followed by a newline.
+ *
+ * The payload is relayed through a one-page anonymous mapping. Exactly the
+ * number of bytes announced in the reply header is read from the socket.
+ *
+ * Returns 0 on success. On failure returns -ENOMEM if the buffer cannot be
+ * mapped, the error from the connect/read helpers, -errno for a failed
+ * select()/read()/fwrite(), or -EIO if the peer closes the socket before
+ * the announced length has arrived.
+ *
+ * NOTE(review): the "< 0" test implies the sender may encode an error as a
+ * negative length; the field's declared type is not visible here - confirm.
+ */
+int
+tap_ctl_stats_fwrite(pid_t pid, const char *params, FILE *stream)
+{
+       tapdisk_message_t message;
+       int sfd = -1, prot, flags, err;
+       ssize_t len;
+       size_t bufsz;
+       char *buf;
+
+       assert(params);
+
+       prot  = PROT_READ|PROT_WRITE;
+       flags = MAP_ANONYMOUS|MAP_PRIVATE;
+       bufsz = sysconf(_SC_PAGE_SIZE);
+
+       /* buf stays MAP_FAILED on error so the cleanup below skips munmap. */
+       buf = mmap(NULL, bufsz, prot, flags, -1, 0);
+       if (buf == MAP_FAILED) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       sfd = _tap_ctl_stats_connect_and_send(pid, params);
+       if (sfd < 0) {
+               err = sfd;
+               goto out;
+       }
+
+       err = tap_ctl_read_message(sfd, &message, NULL);
+       if (err)
+               goto out;
+
+       /* Signed on purpose: with size_t the error test could never fire. */
+       len = (ssize_t)message.u.info.length;
+       if (len < 0) {
+               err = (int)len;
+               goto out;
+       }
+
+       while (len > 0) {
+               /* Never read past the announced payload. */
+               size_t want = (size_t)len < bufsz ? (size_t)len : bufsz;
+               fd_set rfds;
+               ssize_t in;
+               int n;
+
+               FD_ZERO(&rfds);
+               FD_SET(sfd, &rfds);
+
+               n = select(sfd + 1, &rfds, NULL, NULL, NULL);
+               if (n < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       err = -errno;
+                       goto out;
+               }
+
+               /* Keep the result signed so that -1 is seen as an error. */
+               in = read(sfd, buf, want);
+               if (in < 0) {
+                       if (errno == EINTR || errno == EAGAIN)
+                               continue;
+                       err = -errno;
+                       goto out;
+               }
+               if (in == 0) {
+                       /* Peer closed before sending everything announced. */
+                       err = -EIO;
+                       goto out;
+               }
+
+               if (fwrite(buf, in, 1, stream) != 1) {
+                       err = errno ? -errno : -EIO;
+                       goto out;
+               }
+
+               len -= in;
+       }
+
+       if (fwrite("\n", 1, 1, stream) != 1)
+               err = errno ? -errno : -EIO;
+
+out:
+       if (sfd >= 0)
+               close(sfd);
+       if (buf != MAP_FAILED)
+               munmap(buf, bufsz);
+
+       return err;
+}
diff --git a/tools/blktap3/control/tap-ctl-unpause.c 
b/tools/blktap3/control/tap-ctl-unpause.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/control/tap-ctl-unpause.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Send a RESUME request to the tapdisk addressed by @id.
+ *
+ * @params1 is mandatory (asserted); @params2 and @secondary are optional
+ * and are sent as empty strings when NULL. @flags is copied into the
+ * request unchanged. Any supplied string that is not shorter than
+ * TAPDISK_MESSAGE_MAX_PATH_LENGTH makes the call return ENAMETOOLONG
+ * without contacting the tapdisk.
+ *
+ * Returns whatever tap_ctl_connect_send_and_receive() reports on failure,
+ * the error field of a RESUME_RSP or ERROR reply, or EINVAL if the reply is
+ * of any other type.
+ */
+int
+tap_ctl_unpause(const int id, const char *params1, const char *params2,
+               int flags, char *secondary)
+{
+       const size_t max = TAPDISK_MESSAGE_MAX_PATH_LENGTH;
+       tapdisk_message_t msg;
+       int rc;
+
+       assert(params1);
+
+       /* Validate every supplied string before building the request. */
+       /* TODO log error */
+       if (strnlen(params1, max) >= max)
+               return ENAMETOOLONG;
+       if (params2 && strnlen(params2, max) >= max)
+               return ENAMETOOLONG;
+       if (secondary && strnlen(secondary, max) >= max)
+               return ENAMETOOLONG;
+
+       memset(&msg, 0, sizeof(msg));
+       msg.type = TAPDISK_MESSAGE_RESUME;
+       msg.u.resume.flags = flags;
+
+       strncpy(msg.u.resume.params1, params1, max);
+
+       if (params2)
+               strncpy(msg.u.resume.params2, params2, max);
+       else
+               msg.u.resume.params2[0] = '\0';
+
+       if (secondary)
+               strncpy(msg.u.resume.secondary, secondary, max);
+       else
+               msg.u.resume.secondary[0] = '\0';
+
+       rc = tap_ctl_connect_send_and_receive(id, &msg, NULL);
+       if (rc)
+               return rc;
+
+       /* Anything but the paired response or an error report is bogus. */
+       if (msg.type != TAPDISK_MESSAGE_RESUME_RSP
+           && msg.type != TAPDISK_MESSAGE_ERROR) {
+               EPRINTF("got unexpected result '%s' from %d\n",
+                       tapdisk_message_name(msg.type), id);
+               return EINVAL;
+       }
+
+       return msg.u.response.error;
+}
diff --git a/tools/blktap3/control/tap-ctl.c b/tools/blktap3/control/tap-ctl.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/control/tap-ctl.c
@@ -0,0 +1,707 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <signal.h>
+#include <sys/time.h>
+
+#include "tap-ctl.h"
+
+/*
+ * Pointer to a function taking an (int, char **) pair and returning int;
+ * this matches the signature of the tap_cli_*() handlers in this file.
+ */
+typedef int (*tap_ctl_func_t) (int, char **);
+
+/* Pairs a name with a tap_ctl_func_t. */
+struct command {
+       char                     *name;
+       tap_ctl_func_t            func;
+};
+
+/* Write the one-line synopsis of the "list" command to @stream. */
+static void
+tap_cli_list_usage(FILE *stream)
+{
+       static const char synopsis[] =
+               "usage: list [-h] [-p pid] [-t type] [-f file]\n";
+
+       fputs(synopsis, stream);
+}
+
+/*
+ * Print @entry on stdout as one row of space-separated columns:
+ * pid, minor, state, type and path.
+ *
+ * A pid or state of -1, and a NULL type or path, is shown as "-". The
+ * minor column is always "-" because nothing here fills it in.
+ */
+static void
+tap_cli_list_row(tap_list_t *entry)
+{
+       /*
+        * 16 bytes hold any 32-bit "%d" (11 chars) or "%#x" (10 chars)
+        * rendering plus the NUL; snprintf() bounds the write regardless.
+        */
+       char minor_str[16] = "-";
+       char state_str[16] = "-";
+       char pid_str[16]   = "-";
+
+       if (entry->pid != -1)
+               snprintf(pid_str, sizeof(pid_str), "%d", entry->pid);
+
+       if (entry->state != -1)
+               snprintf(state_str, sizeof(state_str), "%#x", entry->state);
+
+       printf("%8s %4s %4s %10s %s\n",
+              pid_str, minor_str, state_str,
+              entry->type ? entry->type : "-",
+              entry->path ? entry->path : "-");
+}
+
+/*
+ * Print @entry on stdout as space-separated key=value pairs on one line.
+ *
+ * "pid" and "state" are omitted when they are -1; "args" is emitted as
+ * type:path only when both strings are present. The line always ends with
+ * a newline, even if no pair was printed.
+ */
+static void
+tap_cli_list_dict(tap_list_t *entry)
+{
+       /* Empty before the first pair, a single blank afterwards. */
+       const char *sep = "";
+
+       if (entry->pid != -1) {
+               fputs(sep, stdout);
+               printf("pid=%d", entry->pid);
+               sep = " ";
+       }
+
+       if (entry->state != -1) {
+               fputs(sep, stdout);
+               printf("state=%#x", entry->state);
+               sep = " ";
+       }
+
+       if (entry->type && entry->path) {
+               fputs(sep, stdout);
+               printf("args=%s:%s", entry->type, entry->path);
+       }
+
+       putc('\n', stdout);
+}
+
+/*
+ * "list" command: print the entries known to the control library.
+ *
+ * Options: -p pid, -t type and -f file filter the output, -h prints the
+ * usage text, and -m is parsed but its value is not used. With -p the list
+ * is fetched via tap_ctl_list_pid(), otherwise via tap_ctl_list(). Each
+ * entry is printed as a table row when stdout is a terminal and as
+ * key=value pairs otherwise.
+ *
+ * Returns 0 on success (and for -h), EINVAL for an unknown option, or the
+ * negated error from the list call.
+ */
+int
+tap_cli_list(int argc, char **argv)
+{
+       struct tqh_tap_list list = TAILQ_HEAD_INITIALIZER(list);
+       const char *type = NULL, *file = NULL;
+       void (*show)(tap_list_t *);
+       tap_list_t *entry;
+       pid_t pid = -1;
+       int minor = -1;
+       int opt, err;
+
+       while ((opt = getopt(argc, argv, "m:p:t:f:h")) != -1) {
+               switch (opt) {
+               case 'h':
+                       tap_cli_list_usage(stdout);
+                       return 0;
+               case '?':
+                       tap_cli_list_usage(stderr);
+                       return EINVAL;
+               case 'f':
+                       file = optarg;
+                       break;
+               case 't':
+                       type = optarg;
+                       break;
+               case 'p':
+                       pid = atoi(optarg);
+                       break;
+               case 'm':
+                       minor = atoi(optarg);
+                       break;
+               }
+       }
+
+       err = (pid != -1) ? tap_ctl_list_pid(pid, &list) : tap_ctl_list(&list);
+       if (err)
+               return -err;
+
+       /* Pick the output format once, based on where stdout goes. */
+       show = isatty(STDOUT_FILENO) ? tap_cli_list_row : tap_cli_list_dict;
+
+       tap_list_for_each_entry(entry, &list) {
+               if (pid >= 0 && entry->pid != pid)
+                       continue;
+
+               /* An entry lacking a type or path passes that filter. */
+               if (type && entry->type && strcmp(entry->type, type) != 0)
+                       continue;
+
+               if (file && entry->path && strcmp(entry->path, file) != 0)
+                       continue;
+
+               show(entry);
+       }
+
+       tap_ctl_list_free(&list);
+
+       return 0;
+}
+
+/* Write the synopsis of the "create" command to @stream. */
+static void
+tap_cli_create_usage(FILE *stream)
+{
+       /* FIXME "stack on existing ..." */
+       static const char synopsis[] =
+               "usage: create <-a type:/path/to/file> [-R readonly] "
+               "[-e <minor> stack on existing tapdisk for the parent chain] "
+               "[-r turn on read caching into leaf node] [-2 <path> "
+               "use secondary image (in mirror mode if no -s)] [-s "
+               "fail over to the secondary image on ENOSPC]\n";
+
+       fputs(synopsis, stream);
+}
+
+/*
+ * "create" command: parse the options and call tap_ctl_create().
+ *
+ * -a <args> is mandatory. -R, -r, -s set message flags; -e <path> and
+ * -2 <path> set a flag and record their argument. -d takes an argument that
+ * is accepted and ignored. -h prints the usage text.
+ *
+ * Returns 0 for -h, EINVAL for an unknown option or a missing -a, and
+ * otherwise the result of tap_ctl_create().
+ */
+static int
+tap_cli_create(int argc, char **argv)
+{
+       char *args = NULL, *secondary = NULL, *prt_path = NULL;
+       int flags = 0;
+       int opt;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "a:Rd:e:r2:sh")) != -1) {
+               switch (opt) {
+               case 'h':
+                       tap_cli_create_usage(stdout);
+                       return 0;
+               case '?':
+                       goto bad_usage;
+               case 'a':
+                       args = optarg;
+                       break;
+               case 'e':
+                       prt_path = optarg;
+                       flags |= TAPDISK_MESSAGE_FLAG_REUSE_PRT;
+                       break;
+               case '2':
+                       secondary = optarg;
+                       flags |= TAPDISK_MESSAGE_FLAG_SECONDARY;
+                       break;
+               case 'R':
+                       flags |= TAPDISK_MESSAGE_FLAG_RDONLY;
+                       break;
+               case 'r':
+                       flags |= TAPDISK_MESSAGE_FLAG_ADD_LCACHE;
+                       break;
+               case 's':
+                       flags |= TAPDISK_MESSAGE_FLAG_STANDBY;
+                       break;
+               }
+       }
+
+       if (args)
+               return tap_ctl_create(args, flags, prt_path, secondary);
+
+bad_usage:
+       tap_cli_create_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "destroy" command to @stream. */
+static void
+tap_cli_destroy_usage(FILE *stream)
+{
+       static const char synopsis[] =
+               "usage: destroy <-p pid> <-a type:/path/to/file>\n";
+
+       fputs(synopsis, stream);
+}
+
+/*
+ * Turn a number of seconds given as text into an absolute point in time:
+ * the current time of day plus atoi(@optarg) seconds.
+ *
+ * The result lives in static storage, so each call overwrites the value
+ * returned by the previous one. The returned pointer is never NULL.
+ */
+static struct timeval*
+tap_cli_timeout(const char *optarg)
+{
+       static struct timeval deadline;
+       struct timeval delay, now;
+
+       delay.tv_sec  = atoi(optarg);
+       delay.tv_usec = 0;
+
+       gettimeofday(&now, NULL);
+       timeradd(&delay, &now, &deadline);
+
+       return &deadline;
+}
+
+/*
+ * "destroy" command: parse the options and call tap_ctl_destroy().
+ *
+ * -p <pid> and -a <params> are mandatory; -t <seconds> supplies a timeout
+ * built by tap_cli_timeout(); -h prints the usage text.
+ *
+ * Returns 0 for -h, EINVAL for an unknown option or a missing mandatory
+ * one, and otherwise the result of tap_ctl_destroy().
+ */
+static int
+tap_cli_destroy(int argc, char **argv)
+{
+       struct timeval *timeout = NULL;
+       char *params = NULL;
+       int pid = -1;
+       int opt;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "p:a:t:h")) != -1) {
+               switch (opt) {
+               case 'h':
+                       tap_cli_destroy_usage(stdout);
+                       return 0;
+               case '?':
+                       goto bad_usage;
+               case 't':
+                       timeout = tap_cli_timeout(optarg);
+                       if (!timeout)
+                               goto bad_usage;
+                       break;
+               case 'a':
+                       params = optarg;
+                       break;
+               case 'p':
+                       pid = atoi(optarg);
+                       break;
+               }
+       }
+
+       if (pid != -1 && params)
+               return tap_ctl_destroy(pid, params, 0, timeout);
+
+bad_usage:
+       tap_cli_destroy_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "spawn" command to @stream. */
+static void
+tap_cli_spawn_usage(FILE *stream)
+{
+       fputs("usage: spawn\n", stream);
+}
+
+/*
+ * "spawn" command: call tap_ctl_spawn() and print the resulting pid.
+ *
+ * The only option is -h, which prints the usage text. The pid is printed
+ * in a sentence when stdout is a terminal and as a bare number otherwise.
+ *
+ * Returns 0 on success (and for -h), EINVAL for an unknown option, or the
+ * negative value returned by tap_ctl_spawn().
+ */
+static int
+tap_cli_spawn(int argc, char **argv)
+{
+       pid_t child;
+       int opt;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "h")) != -1) {
+               if (opt == '?') {
+                       tap_cli_spawn_usage(stderr);
+                       return EINVAL;
+               }
+               if (opt == 'h') {
+                       tap_cli_spawn_usage(stdout);
+                       return 0;
+               }
+       }
+
+       child = tap_ctl_spawn();
+       if (child < 0)
+               return child;
+
+       if (isatty(STDOUT_FILENO))
+               printf("tapdisk spawned with pid %d\n", child);
+       else
+               printf("%d\n", child);
+
+       return 0;
+}
+
+/* Write the one-line synopsis of the "close" command to @stream. */
+static void
+tap_cli_close_usage(FILE *stream)
+{
+       static const char synopsis[] =
+               "usage: close <-p pid> <-a type:/path/to/file> "
+               "[-f force]\n";
+
+       fputs(synopsis, stream);
+}
+
+/*
+ * "close" command: parse the options and call tap_ctl_close().
+ *
+ * -p <pid> and -a <params> are mandatory; -f sets the force argument to -1
+ * (it is 0 otherwise); -t <seconds> supplies a timeout built by
+ * tap_cli_timeout(); -h prints the usage text.
+ *
+ * Returns 0 for -h, EINVAL for an unknown option or a missing mandatory
+ * one, and otherwise the result of tap_ctl_close().
+ */
+static int
+tap_cli_close(int argc, char **argv)
+{
+       struct timeval *timeout = NULL;
+       char *params = NULL;
+       int pid = -1, force = 0;
+       int opt;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "p:a:ft:h")) != -1) {
+               switch (opt) {
+               case 'h':
+                       tap_cli_close_usage(stdout);
+                       return 0;
+               case '?':
+                       goto bad_usage;
+               case 't':
+                       timeout = tap_cli_timeout(optarg);
+                       if (!timeout)
+                               goto bad_usage;
+                       break;
+               case 'f':
+                       force = -1;
+                       break;
+               case 'a':
+                       params = optarg;
+                       break;
+               case 'p':
+                       pid = atoi(optarg);
+                       break;
+               }
+       }
+
+       if (pid != -1 && params)
+               return tap_ctl_close(pid, params, force, timeout);
+
+bad_usage:
+       tap_cli_close_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "pause" command to @stream. */
+static void
+tap_cli_pause_usage(FILE *stream)
+{
+       static const char synopsis[] =
+               "usage: pause <-p pid> <-a type:/path/to/file>\n";
+
+       fputs(synopsis, stream);
+}
+
+/*
+ * "pause" command: parse the options and call tap_ctl_pause().
+ *
+ * -p <pid> and -a <params> are mandatory; -t <seconds> supplies a timeout
+ * built by tap_cli_timeout(); -h prints the usage text.
+ *
+ * Returns 0 for -h, EINVAL for an unknown option or a missing mandatory
+ * one, and otherwise the result of tap_ctl_pause().
+ */
+static int
+tap_cli_pause(int argc, char **argv)
+{
+       int c, pid;
+       struct timeval *timeout;
+       char *params;
+
+       pid     = -1;
+       params  = NULL;
+       timeout = NULL;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "p:a:t:h")) != -1) {
+               switch (c) {
+               case 'p':
+                       pid = atoi(optarg);
+                       break;
+               case 'a':
+                       params = optarg;
+                       break;
+               case 't':
+                       timeout = tap_cli_timeout(optarg);
+                       if (!timeout)
+                               goto usage;
+                       /*
+                        * Without this break the case fell through to '?'
+                        * and every use of -t ended in the usage error.
+                        */
+                       break;
+               case '?':
+                       goto usage;
+               case 'h':
+                       tap_cli_pause_usage(stdout);
+                       return 0;
+               }
+       }
+
+       if (pid == -1 || !params)
+               goto usage;
+
+       return tap_ctl_pause(pid, params, timeout);
+
+usage:
+       tap_cli_pause_usage(stderr);
+       return EINVAL;
+}
+
+/* Write the one-line synopsis of the "unpause" command to @stream. */
+static void
+tap_cli_unpause_usage(FILE *stream)
+{
+       static const char synopsis[] =
+               "usage: unpause <-p pid> <-a type:/path/to/file> "
+               "[-b type:/path/to/file] [-2 secondary]\n";
+
+       fputs(synopsis, stream);
+}
+
+/*
+ * "unpause" command: parse the options and call tap_ctl_unpause().
+ *
+ * -p <pid> and -a <params> are mandatory; -b <params> gives the optional
+ * second parameter string; -2 <path> records a secondary path and sets
+ * TAPDISK_MESSAGE_FLAG_SECONDARY; -h prints the usage text.
+ *
+ * Returns 0 for -h, EINVAL for an unknown option or a missing mandatory
+ * one, and otherwise the result of tap_ctl_unpause().
+ */
+int
+tap_cli_unpause(int argc, char **argv)
+{
+       char *secondary = NULL, *params1 = NULL, *params2 = NULL;
+       int pid = -1, flags = 0;
+       int opt;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "p:a:b:2:h")) != -1) {
+               switch (opt) {
+               case 'h':
+                       tap_cli_unpause_usage(stdout);
+                       return 0;
+               case '?':
+                       goto bad_usage;
+               case '2':
+                       secondary = optarg;
+                       flags |= TAPDISK_MESSAGE_FLAG_SECONDARY;
+                       break;
+               case 'b':
+                       params2 = optarg;
+                       break;
+               case 'a':
+                       params1 = optarg;
+                       break;
+               case 'p':
+                       pid = atoi(optarg);
+                       break;
+               }
+       }
+
+       if (pid != -1 && params1)
+               return tap_ctl_unpause(pid, params1, params2, flags,
+                                      secondary);
+
+bad_usage:
+       tap_cli_unpause_usage(stderr);
+       return EINVAL;
+}
+
+/* Prints the synopsis of the "open" sub-command to @stream. */
+static void tap_cli_open_usage(FILE *stream)
+{
+       fputs("usage: open <-p pid> <-a type:/path/to/file> "
+        "[-R readonly] [-e <type:/path/to/file> stack on existing tapdisk for "
+        "the parent chain] [-r turn on read caching into leaf node] [-2 "
+        "<path> use secondary image (in mirror mode if no -s)] [-s fail over "
+        "to the secondary image on ENOSPC]\n", stream);
+}
+
+/*
+ * "open" sub-command: collects the tapdisk pid (-p), the image to open (-a)
+ * and the optional open flags, then calls tap_ctl_open().
+ * Returns 0 for -h, EINVAL on bad usage, otherwise tap_ctl_open()'s result.
+ */
+static int
+tap_cli_open(int argc, char **argv)
+{
+       const char *params = NULL, *prt_params = NULL, *secondary = NULL;
+       int c, pid = -1, flags = 0;
+
+       optind = 0;
+       /* -R, -r and -s are plain switches: no ':' after them in optstring */
+       while ((c = getopt(argc, argv, "a:Rp:e:r2:sh")) != -1) {
+               switch (c) {
+               case 'p':
+                       pid = atoi(optarg);
+                       break;
+               case 'a':
+                       params = optarg;
+                       break;
+               case 'R':
+                       flags |= TAPDISK_MESSAGE_FLAG_RDONLY;
+                       break;
+               case 'r':
+                       flags |= TAPDISK_MESSAGE_FLAG_ADD_LCACHE;
+                       break;
+               case 'e':
+                       flags |= TAPDISK_MESSAGE_FLAG_REUSE_PRT;
+                       prt_params = optarg;
+                       break;
+               case '2':
+                       flags |= TAPDISK_MESSAGE_FLAG_SECONDARY;
+                       secondary = optarg;
+                       break;
+               case 's':
+                       flags |= TAPDISK_MESSAGE_FLAG_STANDBY;
+                       break;
+               case '?':
+                       goto usage;
+               case 'h':
+                       tap_cli_open_usage(stdout);
+                       return 0;
+               }
+       }
+
+       if (pid == -1 || !params)
+               goto usage;
+
+       return tap_ctl_open(pid, params, flags, prt_params, secondary);
+
+usage:
+       tap_cli_open_usage(stderr);
+       return EINVAL;
+}
+
+/* Prints the synopsis of the "stats" sub-command to @stream. */
+static void tap_cli_stats_usage(FILE *stream)
+{
+       fputs("usage: stats <-p pid> <-a type:/path/to/file>\n", stream);
+}
+
+/* "stats" sub-command: writes what tap_ctl_stats_fwrite() produces for
+ * -p <pid> / -a <params> to stdout, followed by a newline.
+ * Returns 0 on success or for -h, EINVAL on bad usage, otherwise the
+ * error reported by tap_ctl_stats_fwrite(). */
+static int
+tap_cli_stats(int argc, char **argv)
+{
+       char *params = NULL;
+       pid_t pid = -1;
+       int opt, rc;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "p:a:h")) != -1) {
+               switch (opt) {
+               case 'p':
+                       pid = atoi(optarg);
+                       break;
+               case 'a':
+                       params = optarg;
+                       break;
+               case '?':
+                       goto usage;
+               case 'h':
+                       tap_cli_stats_usage(stdout);
+                       return 0;
+               }
+       }
+
+       if (pid == -1 || !params)
+               goto usage;
+
+       rc = tap_ctl_stats_fwrite(pid, params, stdout);
+       if (rc)
+               return rc;
+
+       fprintf(stdout, "\n");
+       return 0;
+
+usage:
+       tap_cli_stats_usage(stderr);
+       return EINVAL;
+}
+
+struct command commands[] = {  /* sub-command name -> handler, see get_command() */
+       { .name = "list",         .func = tap_cli_list          },
+       { .name = "create",       .func = tap_cli_create        },
+       { .name = "destroy",      .func = tap_cli_destroy       },
+       { .name = "spawn",        .func = tap_cli_spawn         },
+       { .name = "open",         .func = tap_cli_open          },
+       { .name = "close",        .func = tap_cli_close         },
+       { .name = "pause",        .func = tap_cli_pause         },
+       { .name = "unpause",      .func = tap_cli_unpause       },
+       { .name = "stats",        .func = tap_cli_stats         },
+};
+
+#define print_commands() /* prints "COMMAND := { a | b | ... }" to stdout */ \
+       do {                                                    \
+               int i, n;                                       \
+               n = sizeof(commands) / sizeof(struct command);  \
+               printf("COMMAND := { ");                        \
+               printf("%s", commands[0].name);                 \
+               for (i = 1; i < n; i++)                         \
+                       printf(" | %s", commands[i].name);      \
+               printf(" }\n");                                 \
+       } while (0)
+
+/* Prints the top-level usage and the command list to stdout, then exits 0. */
+void help(void)
+{
+       fputs("usage: tap-ctl COMMAND [OPTIONS]\n", stdout);
+       print_commands();
+       exit(0);
+}
+
+/* Looks @command up in commands[]; returns the matching entry, or NULL when
+ * the name is 25 characters or longer or matches no entry. */
+struct command *
+get_command(char *command)
+{
+       size_t i, n = sizeof(commands) / sizeof(commands[0]);
+
+       if (strnlen(command, 25) >= 25)
+               return NULL;
+
+       for (i = 0; i < n; i++)
+               if (strcmp(command, commands[i].name) == 0)
+                       return &commands[i];
+
+       return NULL;
+}
+
+int
+main(int argc, char *argv[])
+{
+       char **cargv;
+       struct command *cmd;
+       int cargc, i, cnt, ret;
+       /* Entry point: runs the sub-command named by argv[1]. */
+#ifdef CORE_DUMP
+       #include <sys/resource.h>
+       struct rlimit rlim;
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+       if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+               PERROR("setrlimit failed");
+#endif
+
+       signal(SIGPIPE, SIG_IGN);
+
+       ret = 0;
+
+       if (argc < 2)
+               help();
+       /* help() exits, so argv[1] exists and cmd is valid past the check */
+       cargc = argc - 1;
+       cmd   = get_command(argv[1]);
+       if (!cmd) {
+               EPRINTF("invalid COMMAND %s", argv[1]);
+               help();
+       }
+
+       cargv = malloc(sizeof(char *) * cargc);
+       if (!cargv)
+               exit(ENOMEM);
+       /* handler argv: the command name, then argv[2..] minus "--debug" */
+       cnt      = 1;
+       cargv[0] = cmd->name;
+       for (i = 1; i < cargc; i++) {
+               char *arg = argv[i + (argc - cargc)];
+
+               if (!strcmp(arg, "--debug")) {
+                       tap_ctl_debug = 1;
+                       continue;
+               }
+
+               cargv[cnt++] = arg;
+       }
+
+       ret = cmd->func(cnt, cargv);
+    if (ret) {
+        /* TODO Some functions return +errno, others -errno, fix. */
+        printf("%s\n", strerror(abs(ret)));
+    }
+
+       free(cargv);
+       /* the exit status is the magnitude of the handler's return value */
+       return (ret >= 0 ? ret : -ret);
+}
diff --git a/tools/blktap3/control/tap-ctl.h b/tools/blktap3/control/tap-ctl.h
--- a/tools/blktap3/control/tap-ctl.h
+++ b/tools/blktap3/control/tap-ctl.h
@@ -214,12 +214,19 @@ int tap_ctl_destroy(const int id, const 
 /**
  * Pauses the VBD.
  */
-int tap_ctl_pause(const int id, const char * params, struct timeval
+int tap_ctl_pause(const int pid, const char * params, struct timeval
         *timeout);
 /**
  * Unpauses the VBD
+ *
+ * @param pid the process ID of the tapdisk
+ * @param params1 VDI (type:/path/to/file)
+ * @param params2 new VDI to use (type:/path/to/file), optional
+ * @param flags TODO
+ * @param secondary TODO
  */
-int tap_ctl_unpause(const int id, const char * params);
+int tap_ctl_unpause(const int pid, const char * params1, const char *params2,
+        int flags, char *secondary);
 
 ssize_t tap_ctl_stats(pid_t pid, const char * params, char *buf, size_t size);
 int tap_ctl_stats_fwrite(pid_t pid, const char * params, FILE * out);
diff --git a/tools/blktap3/drivers/Makefile b/tools/blktap3/drivers/Makefile
--- a/tools/blktap3/drivers/Makefile
+++ b/tools/blktap3/drivers/Makefile
@@ -11,7 +11,6 @@ LIBVHDDIR  = $(BLKTAP_ROOT)/vhd/lib
 # FIXME tapdisk-client tapdisk-stream tapdisk-diff not in blktap2.5
 IBIN = tapdisk3
 LOCK_UTIL  = lock-util
-INST_DIR   = $(SBINDIR)
 
 override CFLAGS += \
        -fno-strict-aliasing \
@@ -78,15 +77,16 @@ TAP-OBJS-y  += tapdisk-server.o
 TAP-OBJS-y  += tapdisk-queue.o
 TAP-OBJS-y  += tapdisk-filter.o
 TAP-OBJS-y  += tapdisk-utils.o
-TAP-OBJS-y += tapdisk-log.o
+TAP-OBJS-y  += tapdisk-log.o
 TAP-OBJS-y  += io-optimize.o
 #TAP-OBJS-y += lock.o
-#TAP-OBJS-y += tapdisk-blktap.o
-TAP-OBJS-y += tapdisk-stats.o
-TAP-OBJS-y += tapdisk-storage.o
-TAP-OBJS-y += tapdisk-loglimit.o
-TAP-OBJS-y += tapdisk-logfile.o
-TAP-OBJS-y += tapdisk-syslog.o
+TAP-OBJS-y  += tapdisk-stats.o
+TAP-OBJS-y  += tapdisk-storage.o
+TAP-OBJS-y  += tapdisk-loglimit.o
+TAP-OBJS-y  += tapdisk-logfile.o
+TAP-OBJS-y  += tapdisk-syslog.o
+TAP-OBJS-y  += tapdisk-nbdserver.o
+TAP-OBJS-y  += tapdisk-fdreceiver.o
 #TAP-OBJS-y += $(PORTABLE-OBJS-y)
 
 LIBSRING := sring/libsring.a
@@ -96,6 +96,14 @@ LIBSRING := sring/libsring.a
 
 BLK-OBJS-y  := block-aio.o
 BLK-OBJS-y  += block-vhd.o
+# Block drivers imported from blktap2.5 (each object listed exactly once).
+BLK-OBJS-y  += block-ram.o
+BLK-OBJS-y  += block-cache.o
+BLK-OBJS-y  += block-lcache.o
+BLK-OBJS-y  += block-llcache.o
+BLK-OBJS-y  += block-valve.o
+BLK-OBJS-y  += block-nbd.o
+BLK-OBJS-y  += block-vindex.o
 # FIXME The following exist in blktap2 but not in blktap2.5.
 #BLK-OBJS-y += aes.o
 #BLK-OBJS-y += md5.o
@@ -125,8 +133,8 @@ lock-util: lock.c
 # FIXME img2qcow, qcow-create, qcow2raw not built so not installed
 # FIXME lock-util should be installed
 install: all
-       $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
-       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR)
+       $(INSTALL_DIR) -p $(DESTDIR)$(BINDIR)
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(BINDIR)
 
 clean: subdirs-clean
        rm -rf .*.d *.o *~ xen TAGS $(IBIN) $(LIB) $(LOCK_UTIL)
diff --git a/tools/blktap3/drivers/atomicio.c b/tools/blktap3/drivers/atomicio.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/atomicio.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved.
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+#include "atomicio.h"
+
+/*
+ * Calls f (read or vwrite) on fd until all n bytes at _s were transferred.
+ * EINTR and EAGAIN are retried.  Returns n on success and 0 on any other
+ * error; if f returns 0, errno is set to EPIPE and the short count is
+ * returned.
+ */
+size_t
+atomicio(ssize_t (*f) (int, void *, size_t), int fd, void *_s, size_t n)
+{
+       char *s = _s;
+       size_t pos = 0;
+       ssize_t res;
+
+       while (n > pos) {
+               res = (f) (fd, s + pos, n - pos);
+               switch (res) {
+               case -1:
+                       if (errno == EINTR || errno == EAGAIN)
+                               continue;
+                       return 0;
+               case 0:
+                       errno = EPIPE;
+                       return pos;
+               default:
+                       pos += (size_t)res;
+                       break;
+               }
+       }
+       return (pos);
+}
+
diff --git a/tools/blktap3/drivers/atomicio.h b/tools/blktap3/drivers/atomicio.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/atomicio.h
@@ -0,0 +1,33 @@
+/*     $OpenBSD: atomicio.h,v 1.6 2005/05/24 17:32:43 avsm Exp $       */
+
+/*
+ * Copyright (c) 1995,1999 Theo de Raadt.  All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Ensure all of data on socket comes through. f==read || f==vwrite
+ */
+size_t atomicio(ssize_t (*)(int, void *, size_t), int, void *, size_t);
+/* write cast to the non-const buffer signature that atomicio() takes */
+#define vwrite (ssize_t (*)(int, void *, size_t))write
diff --git a/tools/blktap3/drivers/block-aio.c b/tools/blktap3/drivers/block-aio.c
--- a/tools/blktap3/drivers/block-aio.c
+++ b/tools/blktap3/drivers/block-aio.c
@@ -240,7 +240,8 @@ void tdaio_queue_write(td_driver_t * dri
        td_complete_request(treq, -EBUSY);
 }
 
-int tdaio_close(td_driver_t * driver)
+int tdaio_close(td_driver_t * driver,
+        struct tqh_td_image_handle *head __attribute__((unused)))
 {
        struct tdaio_state *prv = (struct tdaio_state *) driver->data;
 
diff --git a/tools/blktap3/drivers/block-cache.c b/tools/blktap3/drivers/block-cache.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/block-cache.c
@@ -0,0 +1,795 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+#ifdef DEBUG   /* DBG() logs at TLOG_DBG only when DEBUG is defined */
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#else
+#define DBG(_f, _a...) ((void)0)
+#endif
+
+#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
+
+#define RADIX_TREE_PAGE_SHIFT           12 /* 4K pages */
+#define RADIX_TREE_PAGE_SIZE            (1 << RADIX_TREE_PAGE_SHIFT)
+
+#define RADIX_TREE_NODE_SHIFT           9 /* 512B nodes */
+#define RADIX_TREE_NODE_SIZE            (1 << RADIX_TREE_NODE_SHIFT)
+#define RADIX_TREE_NODE_MASK            (RADIX_TREE_NODE_SIZE - 1)
+
+#define BLOCK_CACHE_NODES_PER_PAGE      (1 << (RADIX_TREE_PAGE_SHIFT - RADIX_TREE_NODE_SHIFT))
+
+#define BLOCK_CACHE_MAX_SIZE            (10 << 20) /* 10MB cache */
+#define BLOCK_CACHE_REQUESTS            (TAPDISK_DATA_REQUESTS << 3)
+#define BLOCK_CACHE_PAGE_IDLETIME       60 /* seconds; compared against tv_sec */
+
+typedef struct radix_tree               radix_tree_t;
+typedef struct radix_tree_node          radix_tree_node_t;
+typedef struct radix_tree_link          radix_tree_link_t;
+typedef struct radix_tree_leaf          radix_tree_leaf_t;
+typedef struct radix_tree_page          radix_tree_page_t;
+
+typedef struct block_cache              block_cache_t;
+typedef struct block_cache_request      block_cache_request_t;
+typedef struct block_cache_stats        block_cache_stats_t;
+
+struct radix_tree_page {       /* one cached buffer plus the leaf links using it */
+       char                           *buf;   /* data; freed with the page */
+       size_t                          size;  /* bytes in @buf */
+       uint64_t                        sec;   /* first sector held in @buf */
+       radix_tree_link_t              *owners[BLOCK_CACHE_NODES_PER_PAGE];
+};
+
+struct radix_tree_leaf {
+       radix_tree_page_t              *page;  /* page backing this sector */
+       char                           *buf;   /* page->buf + sector offset */
+};
+
+struct radix_tree_link {
+       uint32_t                        time;  /* tv_sec of the last walk */
+       union {
+               radix_tree_node_t      *next;  /* on nodes above height 0 */
+               radix_tree_leaf_t       leaf;  /* on height-0 nodes */
+       } u;
+};
+
+struct radix_tree_node {
+       int                             height; /* 0: links hold leaves */
+       radix_tree_link_t               links[RADIX_TREE_NODE_SIZE];
+};
+
+struct radix_tree {
+       int                             height; /* height of the root node */
+       uint64_t                        size;   /* bytes held by cached pages */
+       uint32_t                        nodes;  /* nodes currently allocated */
+       radix_tree_node_t              *root;
+
+       block_cache_t                  *cache;  /* cache embedding this tree */
+};
+
+struct block_cache_request {
+       int                             err;
+       char                           *buf;
+       uint64_t                        secs;  /* reduced as clones complete */
+       td_request_t                    treq;
+       block_cache_t                  *cache; /* cache owning this request */
+};
+
+struct block_cache_stats {
+       uint64_t                        reads;
+       uint64_t                        hits;   /* sectors served from cache */
+       uint64_t                        misses;
+       uint64_t                        prunes; /* sectors ejected from cache */
+};
+
+struct block_cache {
+       int                             ptype;
+       char                           *name;  /* set at open, freed at close */
+
+       uint64_t                        sectors; /* driver->info.size at open */
+       /* fixed request pool and the stack of entries that are free */
+       block_cache_request_t           requests[BLOCK_CACHE_REQUESTS];
+       block_cache_request_t          *request_free_list[BLOCK_CACHE_REQUESTS];
+       int                             requests_free;
+
+       event_id_t                      timeout_id; /* prune event */
+
+       radix_tree_t                    tree;
+
+       block_cache_stats_t             stats;
+};
+
+/* Number of sectors covered by a tree whose root has the given height. */
+static inline uint64_t radix_tree_calculate_size(int height)
+{
+       const int shift = height * RADIX_TREE_NODE_SHIFT;
+       return (uint64_t)RADIX_TREE_NODE_SIZE << shift;
+}
+
+/*
+ * Smallest height, starting from 1, whose tree covers @sectors sectors.
+ */
+static inline int
+radix_tree_calculate_height(uint64_t sectors)
+{
+       int height = 1;  /* always allocate root node */
+
+       while (sectors > radix_tree_calculate_size(height))
+               height++;
+
+       return height;
+}
+
+/* Slot of @node->links that @sector selects at this node's height. */
+static inline int radix_tree_index(radix_tree_node_t *node, uint64_t sector)
+{
+       const int shift = node->height * RADIX_TREE_NODE_SHIFT;
+       return (sector >> shift) & RADIX_TREE_NODE_MASK;
+}
+
+/* Non-zero when @node sits at height 0, i.e. its links hold leaves. */
+static inline int radix_tree_node_contains_leaves(
+        radix_tree_t *tree __attribute__((unused)), radix_tree_node_t *node)
+{
+       return !node->height;
+}
+
+static inline int
+radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       return tree->height == node->height;  /* root carries tree->height */
+}
+
+static inline uint64_t radix_tree_size(radix_tree_t *tree)
+{
+       /* bytes of cached pages plus the memory taken by the nodes */
+       return sizeof(radix_tree_node_t) * tree->nodes + tree->size;
+}
+
+/* Zeroes @link, timestamp included; a NULL @link is ignored. */
+static inline void radix_tree_clear_link(radix_tree_link_t *link)
+{
+       if (link != NULL)
+               memset(link, 0, sizeof(*link));
+}
+
+/* Allocates a zeroed node at @height and counts it in tree->nodes.
+ * Returns NULL when the allocation fails. */
+static inline radix_tree_node_t *
+radix_tree_allocate_node(radix_tree_t *tree, int height)
+{
+       radix_tree_node_t *fresh = calloc(1, sizeof(*fresh));
+
+       if (fresh) {
+               fresh->height = height;
+               tree->nodes++;
+       }
+
+       return fresh;
+}
+
+static inline radix_tree_node_t *
+radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent)
+{      /* a child node is allocated one level below @parent */
+       return radix_tree_allocate_node(tree, parent->height - 1);
+}
+
+/* Releases @node and drops it from tree->nodes; NULL is a no-op. */
+void
+radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       if (node) {
+               free(node);
+               tree->nodes--;
+       }
+}
+
+/* Wraps @buf (@size bytes, first sector @sec) in a new page descriptor and
+ * adds @size to tree->size.  Returns NULL, touching nothing, when the
+ * descriptor cannot be allocated. */
+static inline radix_tree_page_t *
+radix_tree_allocate_page(radix_tree_t *tree,
+                        char *buf, uint64_t sec, size_t size)
+{
+       radix_tree_page_t *page = calloc(1, sizeof(*page));
+
+       if (page) {
+               page->buf   = buf;
+               page->sec   = sec;
+               page->size  = size;
+               tree->size += size;
+       }
+       return page;
+}
+
+/* Frees @page and its buffer; updates tree->size and the prune counter. */
+static inline void
+radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+       size_t i, sectors = page->size >> RADIX_TREE_NODE_SHIFT;
+
+       for (i = 0; i < sectors; i++)
+               DBG("%s: ejecting sector 0x%llx\n", tree->cache->name,
+                   (unsigned long long)(page->sec + i));
+       tree->cache->stats.prunes += sectors;
+       tree->size -= page->size;
+       free(page->buf);
+       free(page);
+}
+
+/*
+ * Detaches @page from the tree: every link recorded in page->owners is
+ * cleared and the page is freed.  Nodes stay in place.  NULL is a no-op.
+ */
+static void
+radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page)
+{
+       radix_tree_link_t **owner;
+
+       if (!page)
+               return;
+
+       for (owner = page->owners;
+            owner < page->owners + BLOCK_CACHE_NODES_PER_PAGE; owner++)
+               radix_tree_clear_link(*owner);
+       radix_tree_free_page(tree, page);
+}
+
+/* Points @link at the RADIX_TREE_NODE_SIZE bytes at @off in @page and records
+ * it in the first free page->owners slot.  Does nothing if the range does
+ * not fit in the page or every owner slot is taken. */
+static void
+radix_tree_insert_leaf(radix_tree_t *tree __attribute__((unused)),
+        radix_tree_link_t *link, radix_tree_page_t *page, off_t off)
+{
+       int slot = 0;
+
+       if (off + RADIX_TREE_NODE_SIZE > page->size)
+               return;
+
+       while (slot < BLOCK_CACHE_NODES_PER_PAGE && page->owners[slot])
+               slot++;
+       if (slot == BLOCK_CACHE_NODES_PER_PAGE)
+               return;
+       page->owners[slot] = link;
+       link->u.leaf.page  = page;
+       link->u.leaf.buf   = page->buf + off;
+}
+
+/*
+ * Walks from the root towards @sector, stamping every visited link with the
+ * current time.  Returns the leaf's buffer pointer, or NULL when the path
+ * down to the leaf level is missing.
+ */
+static char *
+radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector)
+{
+       struct timeval tv;
+       radix_tree_link_t *slot;
+       radix_tree_node_t *cur = tree->root;
+
+       gettimeofday(&tv, NULL);
+       for (;;) {
+               slot       = cur->links + radix_tree_index(cur, sector);
+               slot->time = tv.tv_sec;
+
+               if (radix_tree_node_contains_leaves(tree, cur))
+                       return slot->u.leaf.buf;
+
+               cur = slot->u.next;
+               if (!cur)
+                       return NULL;
+       }
+}
+
+/* Installs the leaf for @sector, backed by @page at byte offset @off, adding
+ * interior nodes as needed and evicting any page cached there before.
+ * Returns the leaf's buffer, or NULL if it could not be installed. */
+static char *
+radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector,
+                   radix_tree_page_t *page, off_t off)
+{
+       struct timeval now;
+       radix_tree_link_t *link;
+       radix_tree_node_t *node = tree->root;
+
+       gettimeofday(&now, NULL);
+       do {
+               link       = node->links + radix_tree_index(node, sector);
+               link->time = now.tv_sec;
+
+               if (radix_tree_node_contains_leaves(tree, node)) {
+                       radix_tree_remove_page(tree, link->u.leaf.page);
+                       radix_tree_insert_leaf(tree, link, page, off);
+                       /* evicting the old page zeroed this link: re-stamp */
+                       link->time = now.tv_sec;
+                       return link->u.leaf.buf;
+               }
+
+               if (!link->u.next) {
+                       link->u.next = radix_tree_allocate_child_node(tree,
+                                                                     node);
+                       if (!link->u.next)
+                               return NULL;
+               }
+               node = link->u.next;
+       } while (1);
+}
+
+/* Caches @sectors sectors starting at @sector from @buf: one page is made
+ * for the buffer and one leaf is added per sector.  Returns 0 or -ENOMEM;
+ * on failure the page is dropped again and @buf is not freed here. */
+static int
+radix_tree_add_leaves(radix_tree_t *tree, char *buf,
+                     uint64_t sector, uint64_t sectors)
+{
+       int i;
+       radix_tree_page_t *page = radix_tree_allocate_page(tree, buf, sector,
+                                       sectors << RADIX_TREE_NODE_SHIFT);
+
+       if (!page)
+               return -ENOMEM;
+
+       for (i = 0; i < sectors; i++) {
+               off_t off = i << RADIX_TREE_NODE_SHIFT;
+               if (radix_tree_add_leaf(tree, sector + i, page, off))
+                       continue;
+               page->buf = NULL;  /* keeps the removal from freeing @buf */
+               radix_tree_remove_page(tree, page);
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+/* Recursively tears down the subtree under @node: removes the pages on a
+ * leaf-level node, deletes child branches otherwise, then frees @node. */
+static void
+radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node)
+{
+       radix_tree_link_t *link;
+       int i, leaves;
+
+       if (!node)
+               return;
+
+       leaves = radix_tree_node_contains_leaves(tree, node);
+       for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
+               link = &node->links[i];
+               if (leaves)
+                       radix_tree_remove_page(tree, link->u.leaf.page);
+               else
+                       radix_tree_delete_branch(tree, link->u.next);
+               radix_tree_clear_link(link);
+       }
+       radix_tree_free_node(tree, node);
+}
+
+/* Frees every node and page of @tree and leaves it without a root. */
+static inline void radix_tree_destroy(radix_tree_t *tree)
+{
+       radix_tree_delete_branch(tree, tree->root);
+       tree->root = NULL;
+}
+
+/* Prunes links of @node idle for BLOCK_CACHE_PAGE_IDLETIME or longer.
+ * Returns 1 if @node is empty after pruning, 0 otherwise; an empty node
+ * other than the root is freed before returning. */
+static int
+radix_tree_prune_branch(radix_tree_t *tree,
+                       radix_tree_node_t *node, uint32_t now)
+{
+       int i, empty;
+       radix_tree_link_t *link;
+
+       empty = 1;
+       if (!node)
+               return empty;
+
+       for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
+               link = node->links + i;
+               /* recently used link: keep it, but still prune below it */
+               if (now - link->time < BLOCK_CACHE_PAGE_IDLETIME) {
+                       if (radix_tree_node_contains_leaves(tree, node)) {
+                               empty = 0;
+                               continue;
+                       }
+
+                       if (radix_tree_prune_branch(tree, link->u.next, now))
+                               radix_tree_clear_link(link);
+                       else
+                               empty = 0;
+
+                       continue;
+               }
+               /* idle link: evict whatever hangs off it */
+               if (radix_tree_node_contains_leaves(tree, node))
+                       radix_tree_remove_page(tree, link->u.leaf.page);
+               else
+                       radix_tree_delete_branch(tree, link->u.next);
+
+               radix_tree_clear_link(link);
+       }
+
+       if (empty && !radix_tree_node_is_root(tree, node))
+               radix_tree_free_node(tree, node);
+
+       return empty;
+}
+
+/*
+ * Evicts whatever has been idle for too long, logging the tree's byte count
+ * before and after.  Does nothing while the tree has no root.
+ */
+static void
+radix_tree_prune(radix_tree_t *tree)
+{
+       struct timeval tv;
+
+       if (tree->root) {
+               DPRINTF("tree %s has %"PRIu64" bytes\n",
+                       tree->cache->name, tree->size);
+
+               gettimeofday(&tv, NULL);
+               radix_tree_prune_branch(tree, tree->root, tv.tv_sec);
+
+               DPRINTF("tree %s now has %"PRIu64" bytes\n",
+                       tree->cache->name, tree->size);
+       }
+}
+
+/* Sizes @tree for @sectors sectors and allocates its root node.
+ * Returns 0, or -ENOMEM if the root cannot be allocated. */
+static inline int
+radix_tree_initialize(radix_tree_t *tree, uint64_t sectors)
+{
+       tree->height = radix_tree_calculate_height(sectors);
+       tree->root   = radix_tree_allocate_node(tree, tree->height);
+
+       return tree->root ? 0 : -ENOMEM;
+}
+
+/* Releases the whole tree; thin wrapper around radix_tree_destroy(). */
+static inline void radix_tree_free(radix_tree_t *tree)
+{
+       radix_tree_destroy(tree);
+}
+
+/*
+ * Event callback registered by block_cache_open(): prunes the radix tree
+ * of the cache handed in through @private.
+ */
+static void
+block_cache_prune_event(event_id_t id __attribute__((unused)),
+        char mode __attribute__((unused)), void *private)
+{
+       block_cache_t *cache = (block_cache_t *)private;
+
+       radix_tree_prune(&cache->tree);
+}
+
+/* Pops a request off the free list; NULL when none is left. */
+static inline block_cache_request_t *
+block_cache_get_request(block_cache_t *cache)
+{
+       if (cache->requests_free == 0)
+               return NULL;
+       return cache->request_free_list[--cache->requests_free];
+}
+
+static inline void
+block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq)
+{      /* zero @breq, then push it back onto the free list */
+       memset(breq, 0, sizeof(block_cache_request_t));
+       cache->request_free_list[cache->requests_free++] = breq;
+}
+
+/* Opens the read-only sector cache for @name: sets up the radix tree and the
+ * prune event.  Returns 0 on success, a negative value on failure. */
+static int
+block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       int i, err;
+       radix_tree_t *tree;
+       block_cache_t *cache;
+
+       if (!td_flag_test(flags, TD_OPEN_RDONLY))
+               return -EINVAL;
+       if (driver->info.sector_size != RADIX_TREE_NODE_SIZE)
+               return -EINVAL;
+
+       cache = (block_cache_t *)driver->data;
+       err   = tapdisk_namedup(&cache->name, (char *)name);
+       if (err)
+               return -ENOMEM;
+
+       cache->sectors = driver->info.size;
+       tree = &cache->tree;
+       err  = radix_tree_initialize(tree, cache->sectors);
+       if (err)
+               goto fail;
+
+       tree->cache = cache;
+       cache->requests_free = BLOCK_CACHE_REQUESTS;
+       for (i = 0; i < BLOCK_CACHE_REQUESTS; i++)
+               cache->request_free_list[i] = cache->requests + i;
+
+       cache->timeout_id =
+               tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
+                                             -1, /* dummy fd */
+                                             BLOCK_CACHE_PAGE_IDLETIME << 1,
+                                             block_cache_prune_event, cache);
+       if (cache->timeout_id < 0) {
+               err = cache->timeout_id; /* err is still 0 at this point */
+               goto fail;
+       }
+
+       DPRINTF("opening cache for %s, sectors: %"PRIu64", "
+               "tree: %p, height: %d\n",
+               cache->name, cache->sectors, tree, tree->height);
+       if (mlockall(MCL_CURRENT | MCL_FUTURE))
+               DPRINTF("mlockall failed: %d\n", -errno);
+       return 0;
+
+fail:
+       free(cache->name);
+       radix_tree_free(&cache->tree);
+       return err;
+}
+
+/*
+ * Tear down the cache: drop the prune timeout event, release the radix
+ * tree and free the duplicated image name.  Always returns 0.
+ */
+static int
+block_cache_close(td_driver_t *driver,
+        struct tqh_td_image_handle *head __attribute__((unused)))
+{
+       block_cache_t *cache = (block_cache_t *)driver->data;
+
+       DPRINTF("closing cache for %s\n", cache->name);
+
+       tapdisk_server_unregister_event(cache->timeout_id);
+       radix_tree_free(&cache->tree);
+       free(cache->name);
+
+       return 0;
+}
+
+/*
+ * Checksum helper used by the cache-hit debug message.
+ *
+ * The unconditional "return 0" below disables the computation, so the
+ * function currently always yields 0.  The code after it would sum the
+ * node as RADIX_TREE_NODE_SIZE / sizeof(uint64_t) 64-bit words and
+ * return the complement of that sum.
+ */
+static inline uint64_t
+block_cache_hash(block_cache_t *cache __attribute__((unused)), char *buf)
+{
+       int i, n;
+       uint64_t cksm, *data;
+
+       /* Hashing is switched off; everything below is unreachable. */
+       return 0;
+
+       cksm = 0;
+       data = (uint64_t *)buf;
+       n    = RADIX_TREE_NODE_SIZE / sizeof(uint64_t);
+
+       for (i = 0; i < n; i++)
+               cksm += data[i];
+
+       return ~cksm;
+}
+
+/*
+ * Serve @treq entirely from cached nodes: iov[n] is the cached node for
+ * sector treq.sec + n.  Each node is copied into the request buffer and
+ * the request is completed with status 0.
+ */
+static void
+block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[])
+{
+       int n;
+
+       cache->stats.hits += treq.secs;
+
+       for (n = 0; n < treq.secs; n++) {
+               off_t dst_off = n << RADIX_TREE_NODE_SHIFT;
+
+               DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n",
+                   cache->name, treq.sec + n, block_cache_hash(cache, iov[n]));
+
+               memcpy(treq.buf + dst_off, iov[n], RADIX_TREE_NODE_SIZE);
+       }
+
+       td_complete_request(treq, 0);
+}
+
+/*
+ * Completion callback for the cloned read issued by block_cache_miss().
+ *
+ * It may run several times for one request; each call accounts for
+ * clone.secs sectors and the first non-zero error is remembered.  Once
+ * all sectors are in, the data is copied to the original request
+ * buffer and the private buffer is offered to the radix tree.  The
+ * private buffer is freed here on error or when the tree rejects it.
+ */
+static void
+block_cache_populate_cache(td_request_t clone, int err)
+{
+       block_cache_request_t *breq = (block_cache_request_t *)clone.cb_data;
+       block_cache_t *cache = breq->cache;
+       int n;
+
+       breq->secs -= clone.secs;
+       if (!breq->err)
+               breq->err = err;
+
+       /* Still waiting for more partial completions. */
+       if (breq->secs)
+               return;
+
+       if (breq->err) {
+               free(breq->buf);
+       } else {
+               for (n = 0; n < breq->treq.secs; n++) {
+                       off_t off = n << RADIX_TREE_NODE_SHIFT;
+                       DBG("%s: populating sec 0x%08llx\n",
+                           cache->name, breq->treq.sec + n);
+                       memcpy(breq->treq.buf + off,
+                              breq->buf + off, RADIX_TREE_NODE_SIZE);
+               }
+
+               if (radix_tree_add_leaves(&cache->tree, breq->buf,
+                                         breq->treq.sec, breq->treq.secs))
+                       free(breq->buf);
+       }
+
+       td_complete_request(breq->treq, breq->err);
+       block_cache_put_request(cache, breq);
+}
+
+/*
+ * Handle a read that is not fully cached.
+ *
+ * When the tree has room, a request slot is free and an aligned buffer
+ * can be allocated, the read is redirected into that private buffer
+ * and block_cache_populate_cache() is installed as its callback.
+ * Otherwise the request is forwarded untouched.
+ */
+static void
+block_cache_miss(block_cache_t *cache, td_request_t treq)
+{
+       td_request_t clone = treq;
+       radix_tree_t *tree = &cache->tree;
+       size_t size = treq.secs << RADIX_TREE_NODE_SHIFT;
+       block_cache_request_t *breq;
+       void *buf;
+
+       DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec);
+
+       cache->stats.misses += treq.secs;
+
+       if (radix_tree_size(tree) + size < BLOCK_CACHE_MAX_SIZE) {
+               breq = block_cache_get_request(cache);
+               if (breq) {
+                       if (posix_memalign(&buf, RADIX_TREE_NODE_SIZE, size)) {
+                               block_cache_put_request(cache, breq);
+                       } else {
+                               breq->treq    = treq;
+                               breq->secs    = treq.secs;
+                               breq->err     = 0;
+                               breq->buf     = buf;
+                               breq->cache   = cache;
+
+                               clone.buf     = buf;
+                               clone.cb      = block_cache_populate_cache;
+                               clone.cb_data = breq;
+                       }
+               }
+       }
+
+       td_forward_request(clone);
+}
+
+/*
+ * Read entry point.  Requests larger than BLOCK_CACHE_NODES_PER_PAGE
+ * sectors bypass the cache.  Otherwise every sector is looked up in the
+ * radix tree: one missing sector turns the whole request into a miss,
+ * and only a complete set of leaves is served as a hit.
+ */
+static void
+block_cache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       block_cache_t *cache = (block_cache_t *)driver->data;
+       char *iov[BLOCK_CACHE_NODES_PER_PAGE];
+       int n;
+
+       cache->stats.reads += treq.secs;
+
+       if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE) {
+               td_forward_request(treq);
+               return;
+       }
+
+       for (n = 0; n < treq.secs; n++) {
+               iov[n] = radix_tree_find_leaf(&cache->tree, treq.sec + n);
+               if (!iov[n]) {
+                       block_cache_miss(cache, treq);
+                       return;
+               }
+       }
+
+       block_cache_hit(cache, treq, iov);
+}
+
+/* Writes are never accepted: every write request completes with -EPERM. */
+static void
+block_cache_queue_write(td_driver_t *driver __attribute__((unused)),
+        td_request_t treq)
+{
+       const int status = -EPERM;
+
+       td_complete_request(treq, status);
+}
+
+/* The cache reports no parent of its own: always returns -EINVAL. */
+static int
+block_cache_get_parent_id(td_driver_t *driver __attribute__((unused)),
+        td_disk_id_t *id __attribute__((unused)))
+{
+       const int status = -EINVAL;
+
+       return status;
+}
+
+/*
+ * A parent is acceptable only when it is flagged TD_DRIVER_RDONLY and
+ * carries the same name as this driver.  Returns 0 when both hold and
+ * -EINVAL otherwise.
+ */
+static int
+block_cache_validate_parent(td_driver_t *driver,
+                           td_driver_t *pdriver,
+                           td_flag_t flags __attribute__((unused)))
+{
+       if (td_flag_test(pdriver->state, TD_DRIVER_RDONLY) &&
+           !strcmp(driver->name, pdriver->name))
+               return 0;
+
+       return -EINVAL;
+}
+
+/* Log the cache name and its read/hit/miss/prune counters. */
+static void
+block_cache_debug(td_driver_t *driver)
+{
+       block_cache_t *cache = (block_cache_t *)driver->data;
+
+       WARN("BLOCK CACHE %s\n", cache->name);
+       WARN("reads: %"PRIu64", hits: %"PRIu64", "
+            "misses: %"PRIu64", prunes: %"PRIu64"\n",
+            cache->stats.reads, cache->stats.hits,
+            cache->stats.misses, cache->stats.prunes);
+}
+
+/*
+ * Driver table for the block cache: wires the block_cache_* handlers
+ * defined in this file into the tap_disk interface and sizes the
+ * per-driver private data as one block_cache_t.
+ */
+struct tap_disk tapdisk_block_cache = {
+       .disk_type                  = "tapdisk_block_cache",
+       .flags                      = 0,
+       .private_data_size          = sizeof(block_cache_t),
+       .td_open                    = block_cache_open,
+       .td_close                   = block_cache_close,
+       .td_queue_read              = block_cache_queue_read,
+       .td_queue_write             = block_cache_queue_write,
+       .td_get_parent_id           = block_cache_get_parent_id,
+       .td_validate_parent         = block_cache_validate_parent,
+       .td_debug                   = block_cache_debug,
+};
diff --git a/tools/blktap3/drivers/block-lcache.c b/tools/blktap3/drivers/block-lcache.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/block-lcache.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2010, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Local persistent cache: write any sectors not found in the leaf back to the 
+ * leaf.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+
+#include "vhd.h"
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+#define DEBUG 1
+
+#ifdef DEBUG
+#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
+#else
+#define DBG(_f, _a...) ((void)0)
+#endif
+#define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: "_f "in %s:%d", \
+                                   ##_a, __func__, __LINE__)
+#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, _f, ##_a)
+#define BUG()           td_panic()
+#define BUG_ON(_cond)   if (unlikely(_cond)) { td_panic(); }
+/*
+ * Warn when _cond holds.  The parameter name now matches the name used
+ * in the expansion, and the condition text is passed as a "%s" argument
+ * instead of being used as the format string.
+ */
+#define WARN_ON(_cond)  if (unlikely(_cond)) { WARN("%s ", #_cond); }
+
+#define MIN(a, b)       ((a) < (b) ? (a) : (b))
+
+/* Number of request slots (and bounce buffers) kept per cache. */
+#define TD_LCACHE_MAX_REQ               (MAX_REQUESTS*2)
+/* Size in bytes of one bounce buffer; evaluates sysconf() on each use. */
+#define TD_LCACHE_BUFSZ                 (MAX_SEGMENTS_PER_REQ * \
+                                        sysconf(_SC_PAGE_SIZE))
+
+
+typedef struct lcache                   td_lcache_t;
+typedef struct lcache_request           td_lcache_req_t;
+
+/* One in-flight read, plus the follow-up write that stores it locally. */
+struct lcache_request {
+       /* Bounce buffer mapped by lcache_create_buffers(). */
+       char                           *buf;
+       /* First non-zero error reported for the forwarded read. */
+       int                             err;
+
+       /* Copy of the original read request. */
+       td_request_t                    treq;
+       /* Sectors of the forwarded read still outstanding. */
+       int                             secs;
+
+       /* Write request and its single iovec, filled by lcache_store_read(). */
+       td_vbd_request_t                vreq;
+       struct td_iovec                 iov;
+
+       /* Owning cache. */
+       td_lcache_t                    *cache;
+};
+
+/* Per-driver state of the local persistent cache. */
+struct lcache {
+       /* Image name duplicated in lcache_open(); also the statfs() path. */
+       char                           *name;
+
+       /* Request slots, and a stack of pointers to the free ones. */
+       td_lcache_req_t                 reqv[TD_LCACHE_MAX_REQ];
+       td_lcache_req_t                *free[TD_LCACHE_MAX_REQ];
+       int                             n_free;
+
+       /* NOTE(review): not referenced by the code in this file section. */
+       char                           *buf;
+       size_t                          bufsz;
+
+       /* Whether reads are currently stored, and when that was last checked. */
+       int                             wr_en;
+       struct timeval                  ts;
+};
+
+/* Pop a free request slot, or return NULL when none is available. */
+static td_lcache_req_t *
+lcache_alloc_request(td_lcache_t *cache)
+{
+       if (unlikely(!cache->n_free))
+               return NULL;
+
+       cache->n_free--;
+       return cache->free[cache->n_free];
+}
+
+/* Push @req back on the free stack; panics if the stack is already full. */
+static void
+lcache_free_request(td_lcache_t *cache, td_lcache_req_t *req)
+{
+       BUG_ON(cache->n_free >= TD_LCACHE_MAX_REQ);
+
+       cache->free[cache->n_free] = req;
+       cache->n_free++;
+}
+
+/*
+ * Drain the free stack and unmap the bounce buffer of every slot found
+ * there.  Slots that are not on the free stack are not touched.
+ */
+static void
+lcache_destroy_buffers(td_lcache_t *cache)
+{
+       td_lcache_req_t *req;
+
+       while ((req = lcache_alloc_request(cache)) != NULL)
+               munmap(req->buf, TD_LCACHE_BUFSZ);
+}
+
+/*
+ * Map one anonymous, locked bounce buffer of TD_LCACHE_BUFSZ bytes per
+ * request slot and place each slot on the free stack.
+ *
+ * Returns 0 on success.  On an mmap() failure the buffers mapped so far
+ * are released and the negated errno is returned.
+ */
+static int
+lcache_create_buffers(td_lcache_t *cache)
+{
+       const int prot  = PROT_READ|PROT_WRITE;
+       const int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_LOCKED;
+       int idx;
+
+       cache->n_free = 0;
+
+       for (idx = 0; idx < TD_LCACHE_MAX_REQ; idx++) {
+               td_lcache_req_t *req = &cache->reqv[idx];
+
+               req->buf = mmap(NULL, TD_LCACHE_BUFSZ, prot, flags, -1, 0);
+               if (req->buf == MAP_FAILED) {
+                       int err = -errno;
+
+                       req->buf = NULL;
+                       EPRINTF("Buffer init failure: %d", err);
+                       lcache_destroy_buffers(cache);
+                       return err;
+               }
+
+               lcache_free_request(cache, req);
+       }
+
+       return 0;
+}
+
+/*
+ * Release the bounce buffers found on the free stack and the duplicated
+ * image name.  Always returns 0.
+ */
+static int
+lcache_close(td_driver_t *driver,
+        struct tqh_td_image_handle *head __attribute__((unused)))
+{
+       td_lcache_t *lc = driver->data;
+
+       lcache_destroy_buffers(lc);
+       free(lc->name);
+
+       return 0;
+}
+
+/*
+ * Open the local cache for image @name: duplicate the name, create the
+ * bounce buffers and start with write-back enabled.  If either step
+ * fails, lcache_close() is used for cleanup and the error is returned.
+ */
+static int
+lcache_open(td_driver_t *driver, const char *name,
+        td_flag_t flags __attribute__((unused)))
+{
+       td_lcache_t *cache = driver->data;
+       int err;
+
+       err = tapdisk_namedup(&cache->name, (char *)name);
+       if (!err)
+               err = lcache_create_buffers(cache);
+
+       if (err) {
+               lcache_close(driver, NULL);
+               return err;
+       }
+
+       timerclear(&cache->ts);
+       cache->wr_en = 1;
+
+       return 0;
+}
+
+/*
+ * NB. lcache->{wr_en,ts}: test free space in the caching SR before
+ * attempting to store our reads. VHD block allocation writes on Ext3
+ * have the nasty property of blocking excessively after running out
+ * of space. We therefore enable/disable ourselves at a 1/s
+ * granularity, querying free space through statfs beforehand.
+ */
+
+/*
+ * Query the file system holding cache->name.
+ *
+ * Returns the number of free blocks, capped at LONG_MAX, and stores the
+ * block size in *bsize when @bsize is non-NULL.  When statfs() fails,
+ * its return value is passed back and *bsize is left untouched.
+ */
+static long
+lcache_fs_bfree(const td_lcache_t *cache, long *bsize)
+{
+       struct statfs fst;
+       int rc = statfs(cache->name, &fst);
+
+       if (rc)
+               return rc;
+
+       if (likely(bsize))
+               *bsize = fst.f_bsize;
+
+       if (fst.f_bfree < LONG_MAX)
+               return fst.f_bfree;
+
+       return LONG_MAX;
+}
+
+/*
+ * Decide whether storing reads is worthwhile right now: true when the
+ * free block count exceeds the 2<<20 byte threshold expressed in file
+ * system blocks.  A failed free-space query yields a non-positive
+ * count and therefore disables storing.
+ */
+static int
+__lcache_wr_enabled(const td_lcache_t *cache)
+{
+       const long threshold = 2<<20; /* B */
+       long bsz = 1;
+       long bfree = lcache_fs_bfree(cache, &bsz);
+
+       return bfree > threshold / bsz;
+}
+
+/*
+ * Return the cached write-enable flag, re-evaluating it through
+ * __lcache_wr_enabled() once at least one second has passed since the
+ * last evaluation.
+ */
+static int
+lcache_wr_enabled(td_lcache_t *cache)
+{
+       const int timeout = 1; /* s */
+       struct timeval now, elapsed;
+
+       gettimeofday(&now, NULL);
+       timersub(&now, &cache->ts, &elapsed);
+
+       if (elapsed.tv_sec < timeout)
+               return cache->wr_en;
+
+       cache->wr_en = __lcache_wr_enabled(cache);
+       cache->ts    = now;
+
+       return cache->wr_en;
+}
+
+/*
+ * Completion of a store-back write.  An -ENOSPC result switches
+ * storing off; in every case the request slot is returned to the pool.
+ */
+static void
+__lcache_write_cb(td_vbd_request_t *vreq, int error,
+                 void *token, int final __attribute__((unused)))
+{
+       td_lcache_t *cache = token;
+       td_lcache_req_t *req = containerof(vreq, td_lcache_req_t, vreq);
+
+       if (error == -ENOSPC)
+               cache->wr_en = 0;
+
+       lcache_free_request(cache, req);
+}
+
+/*
+ * Queue a write of the data just read (held in req->buf) covering the
+ * same sector range on the VBD the read came from.
+ * __lcache_write_cb() releases the slot when it completes.  A queueing
+ * failure is treated as fatal.
+ */
+static void
+lcache_store_read(td_lcache_t *cache, td_lcache_req_t *req)
+{
+       struct td_iovec *iov   = &req->iov;
+       td_vbd_request_t *vreq = &req->vreq;
+       int err;
+
+       iov->base    = req->buf;
+       iov->secs    = req->treq.secs;
+
+       vreq->op     = TD_OP_WRITE;
+       vreq->sec    = req->treq.sec;
+       vreq->iov    = iov;
+       vreq->iovcnt = 1;
+       vreq->cb     = __lcache_write_cb;
+       vreq->token  = cache;
+
+       err = tapdisk_vbd_queue_request(req->treq.vreq->vbd, vreq);
+       BUG_ON(err);
+}
+
+/*
+ * Finish a forwarded read: on success copy the bounce buffer into the
+ * original request buffer, then complete the original request.  A
+ * successful read is stored back while storing is enabled; otherwise
+ * the slot is released immediately.
+ */
+static void
+lcache_complete_read(td_lcache_t *cache, td_lcache_req_t *req)
+{
+       if (likely(!req->err))
+               memcpy(req->treq.buf, req->buf,
+                      req->treq.secs << SECTOR_SHIFT);
+
+       td_complete_request(req->treq, req->err);
+
+       if (likely(!req->err) && lcache_wr_enabled(cache))
+               lcache_store_read(cache, req);
+       else
+               lcache_free_request(cache, req);
+}
+
+/*
+ * Callback for (possibly partial) completions of the forwarded read.
+ * Subtracts the completed sectors, keeps the first non-zero error and
+ * finishes the request once nothing is outstanding.
+ */
+static void
+__lcache_read_cb(td_request_t treq, int err)
+{
+       td_lcache_req_t *req = treq.cb_data;
+       td_lcache_t *cache = req->cache;
+
+       BUG_ON(req->secs < treq.secs);
+
+       req->secs -= treq.secs;
+       if (!req->err)
+               req->err = err;
+
+       if (req->secs)
+               return;
+
+       lcache_complete_read(cache, req);
+}
+
+/*
+ * Read entry point: forward the read with its data redirected into a
+ * bounce buffer and __lcache_read_cb() as the callback.  With no free
+ * slot the request completes with -EBUSY.
+ */
+static void
+lcache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       td_lcache_t *cache = driver->data;
+       td_lcache_req_t *req = lcache_alloc_request(cache);
+       td_request_t clone = treq;
+
+       if (!req) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       req->treq  = treq;
+       req->cache = cache;
+       req->secs  = treq.secs;
+       req->err   = 0;
+
+       clone.buf     = req->buf;
+       clone.cb      = __lcache_read_cb;
+       clone.cb_data = req;
+
+       td_forward_request(clone);
+}
+
+/* This driver reports no parent of its own: always returns -EINVAL. */
+static int
+lcache_get_parent_id(td_driver_t *driver __attribute__((unused)),
+        td_disk_id_t *id __attribute__((unused)))
+{
+       const int status = -EINVAL;
+
+       return status;
+}
+
+/*
+ * Accept a parent only when its name equals this driver's name.
+ * Returns 0 on a match and -EINVAL otherwise.
+ */
+static int
+lcache_validate_parent(td_driver_t *driver,
+                      td_driver_t *pdriver,
+                      td_flag_t flags __attribute__((unused)))
+{
+       if (!strcmp(driver->name, pdriver->name))
+               return 0;
+
+       return -EINVAL;
+}
+
+/*
+ * Driver table for the local persistent cache.  No td_queue_write
+ * handler is installed here.
+ */
+struct tap_disk tapdisk_lcache = {
+       .disk_type                  = "tapdisk_lcache",
+       .flags                      = 0,
+       .private_data_size          = sizeof(td_lcache_t),
+       .td_open                    = lcache_open,
+       .td_close                   = lcache_close,
+       .td_queue_read              = lcache_queue_read,
+       .td_get_parent_id           = lcache_get_parent_id,
+       .td_validate_parent         = lcache_validate_parent,
+};
diff --git a/tools/blktap3/drivers/block-llcache.c b/tools/blktap3/drivers/block-llcache.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/block-llcache.c
@@ -0,0 +1,610 @@
+/*
+ * Copyright (c) 2010, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+
+#include "tapdisk.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-disktype.h"
+
+#define DBG(_f, _a...)  tlog_syslog(TLOG_DBG, _f, ##_a)
+#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, _f, ##_a)
+#define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: "_f "in %s:%d", \
+                                   ##_a, __func__, __LINE__)
+
+#define BUG()           td_panic()
+#define BUG_ON(_cond)   if (unlikely(_cond)) { td_panic(); }
+/*
+ * Warn when _cond holds.  The parameter name now matches the name used
+ * in the expansion, and the condition text is passed as a "%s" argument
+ * instead of being used as the format string.
+ */
+#define WARN_ON(_cond)  if (unlikely(_cond)) { WARN("%s ", #_cond); }
+
+/*
+ * Fold the status of one (partial) write into a request's accumulated
+ * status.
+ *
+ * @curr:  status accumulated so far (0 or a negative error).
+ * @error: status of the write that just finished.
+ *
+ * A new error replaces @curr only when nothing was recorded yet or
+ * when the recorded status is -ENOSPC.  In every other case @curr is
+ * kept, so a later success or a later error never erases an error that
+ * was already recorded.
+ */
+int ll_write_error(int curr, int error)
+{
+       if (error && (!curr || curr == -ENOSPC))
+               return error;
+
+       return curr;
+}
+
+/*
+ * Log that, because of @error, I/O moves away from @local and over to
+ * @shared.  Each image is printed as "<disk type name>:<image name>".
+ */
+void ll_log_switch(int type __attribute__((unused)), int error,
+                  td_image_t *local, td_image_t *shared)
+{
+       const char *reason = strerror(-error);
+
+       WARN("WARNING: %s, on %s:%s. Switching to %s:%s.",
+            reason,
+            tapdisk_disk_types[local->type]->name, local->name,
+            tapdisk_disk_types[shared->type]->name, shared->name);
+}
+
+/*
+ * LLP: Local leaf persistent cache
+ *      -- Persistent write caching in local storage.
+ *
+ *    VBD
+ *      \
+ *       +--r/w--> llp+vhd:/local/leaf
+ *        \
+ *         +--r/w--> vhd:/shared/leaf
+ *          \
+ *           +--r/o--> vhd:/shared/parent
+ *
+ * We drive two 'leaf' (r/w) images: One LOCAL (i.e. on local storage,
+ * unreliable and prone to out-of-space failures), and one SHARED
+ * (i.e. in shared storage with plenty of physical backing).
+ *
+ * All images are on a linear read chain: LOCAL inherits from SHARED,
+ * which inherits from a shared master image. This filter driver
+ * aggregates LOCAL. SHARED is our immediate parent, forced into R/W
+ * mode.
+ *
+ * Unless LOCAL failed, reads are issued to LOCAL, to save shared
+ * storage bandwidth. In case of failure, SHARED provides continued
+ * VDI consistency.
+ *
+ */
+enum {
+       LLP_MIRROR = 1,
+       /*
+        * LLP_MIRROR:
+        *
+        * Writes are mirrored to both LOCAL and SHARED. Reads are
+        * issued to LOCAL.
+        *
+        * Failure to write LOCAL is recoverable. The driver will
+        * transition to LLP_SHARED.
+        *
+        * Failure to write SHARED is irrecoverable, and signaled to
+        * the original issuer.
+        */
+
+       LLP_SHARED = 2,
+       /*
+        * LLP_SHARED:
+        *
+        * Writes are issued to SHARED only. As are reads.
+        *
+        * Failure to write SHARED is irrecoverable.
+        */
+};
+
+typedef struct llpcache                 td_llpcache_t;
+typedef struct llpcache_request         td_llpcache_req_t;
+#define TD_LLPCACHE_MAX_REQ             (MAX_REQUESTS*2)
+
+/* One of the two clone write requests issued for a mirrored write. */
+struct llpcache_vreq {
+       /* Which image this clone is destined for; also its lvr[] index. */
+       enum { LOCAL = 0, SHARED = 1 }  target;
+       td_vbd_request_t                vreq;
+};
+
+/* Bookkeeping for one original write that was forked into clones. */
+struct llpcache_request {
+       /* The original write, held until the clones have completed. */
+       td_request_t            treq;
+
+       /* Single iovec describing treq's buffer, shared by both clones. */
+       struct td_iovec         iov;
+       /* Accumulated write status (see ll_write_error()). */
+       int                     error;
+
+       /* Clone requests, indexed by LOCAL / SHARED. */
+       struct llpcache_vreq    lvr[2];
+
+       /* Bitmask of clones still outstanding: bit (1 << target). */
+       unsigned int            pending;
+       /* NOTE(review): not referenced by the code in this file section. */
+       int                     mode;
+};
+
+/* Per-driver state of the LLP (local leaf persistent) cache. */
+struct llpcache {
+       /* LOCAL image, opened in llpcache_open(). */
+       td_image_t             *local;
+       /* LLP_MIRROR or LLP_SHARED. */
+       int                     mode;
+
+       /* Request slots, and a stack of pointers to the free ones. */
+       td_llpcache_req_t       reqv[TD_LLPCACHE_MAX_REQ];
+       td_llpcache_req_t      *free[TD_LLPCACHE_MAX_REQ];
+       int                     n_free;
+};
+
+/* Pop a free request slot, or return NULL when none is available. */
+static td_llpcache_req_t *
+llpcache_alloc_request(td_llpcache_t *s)
+{
+       if (unlikely(!s->n_free))
+               return NULL;
+
+       s->n_free--;
+       return s->free[s->n_free];
+}
+
+/* Push @req back on the free stack; panics if the stack is already full. */
+static void
+llpcache_free_request(td_llpcache_t *s, td_llpcache_req_t *req)
+{
+       BUG_ON(s->n_free >= TD_LLPCACHE_MAX_REQ);
+
+       s->free[s->n_free] = req;
+       s->n_free++;
+}
+
+/*
+ * Completion callback for one clone of a mirrored write.
+ *
+ * An -ENOSPC failure of the LOCAL clone is not reported upwards: it is
+ * logged, the driver switches to LLP_SHARED and the clone counts as
+ * successful.  Once no clone is pending any more, the original request
+ * is completed with the accumulated status and the slot is released.
+ */
+static void
+__llpcache_write_cb(td_vbd_request_t *vreq, int error,
+                  void *token, int final __attribute__((unused)))
+{
+       td_llpcache_t *s = token;
+       struct llpcache_vreq *lvr;
+       td_llpcache_req_t *req;
+       int mask;
+
+       /* Recover the clone wrapper, then the request that embeds it. */
+       lvr = containerof(vreq, struct llpcache_vreq, vreq);
+       req = containerof(lvr, td_llpcache_req_t, lvr[lvr->target]);
+
+       mask = 1U << lvr->target;
+       /* Explicit ';' so this no longer depends on how BUG_ON expands. */
+       BUG_ON(!(req->pending & mask));
+
+       if (lvr->target == LOCAL && error == -ENOSPC) {
+               td_image_t *shared = TAILQ_NEXT(req->treq.image, entry);
+               ll_log_switch(DISK_TYPE_LLPCACHE, error,
+                             s->local, shared);
+               s->mode = LLP_SHARED;
+               error = 0;
+       }
+
+       req->pending &= ~mask;
+       req->error    = ll_write_error(req->error, error);
+
+       if (!req->pending) {
+               /* FIXME: Make sure this won't retry. */
+               td_complete_request(req->treq, req->error);
+               llpcache_free_request(s, req);
+       }
+}
+
+/*
+ * NB. Write mirroring. Lacking per-image queues, it's still a
+ * hack. But shall do for now:
+ *
+ *   1. Store the treq, thereby blocking the original vreq.
+ *   2. Reissue, as two clone vreqs. One local, one shared.
+ *   3. Clones seen again then get forwarded.
+ *   4. Treq completes after both vreqs.
+ *
+ * We can recognize clones by matching the vreq->token field.
+ */
+
+/*
+ * Issue the clone write for @target (LOCAL or SHARED) of @req.
+ *
+ * The clone carries @s as its token, which is how
+ * llpcache_queue_write() recognizes it when it comes round again.  On
+ * success the target's bit is set in req->pending and 0 is returned.
+ * On failure the error is recorded in req->error, unless one is
+ * already set, and returned.
+ */
+static int
+llpcache_requeue_treq(td_llpcache_t *s, td_llpcache_req_t *req, int target)
+{
+       struct llpcache_vreq *lvr = &req->lvr[target];
+       td_vbd_request_t *vreq    = &lvr->vreq;
+       int err;
+
+       lvr->target  = target;
+
+       vreq->op     = TD_OP_WRITE;
+       vreq->sec    = req->treq.sec;
+       vreq->iov    = &req->iov;
+       vreq->iovcnt = 1;
+       vreq->cb     = __llpcache_write_cb;
+       vreq->token  = s;
+
+       err = tapdisk_vbd_queue_request(req->treq.vreq->vbd, vreq);
+       if (err) {
+               if (!req->error)
+                       req->error = err;
+               return err;
+       }
+
+       req->pending |= 1UL << target;
+       return 0;
+}
+
+/*
+ * Split an incoming write into two clones, one for LOCAL and one for
+ * SHARED.  The original request is parked in a slot until both clones
+ * have completed.  With no free slot it completes with -EBUSY.
+ *
+ * If queueing fails while no clone is outstanding, the request is
+ * completed right here with the recorded error.  Otherwise completion
+ * is left to __llpcache_write_cb().
+ */
+static void
+llpcache_fork_write(td_llpcache_t *s, td_request_t treq)
+{
+       td_llpcache_req_t *req;
+       struct td_iovec *iov;
+       int err;
+
+       req = llpcache_alloc_request(s);
+       if (!req) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /*
+        * Clear the whole slot.  sizeof(req) would only cover the size
+        * of the pointer and leave a stale error or pending mask from
+        * the slot's previous use in place.
+        */
+       memset(req, 0, sizeof(*req));
+
+       req->treq     = treq;
+
+       iov           = &req->iov;
+       iov->base     = treq.buf;
+       iov->secs     = treq.secs;
+
+       err = llpcache_requeue_treq(s, req, LOCAL);
+       if (err)
+               goto fail;
+
+       err = llpcache_requeue_treq(s, req, SHARED);
+       if (err)
+               goto fail;
+
+       return;
+
+fail:
+       if (!req->pending) {
+               td_complete_request(treq, req->error);
+               llpcache_free_request(s, req);
+       }
+}
+
+/*
+ * Route a clone write to its destination: SHARED clones are forwarded
+ * down the chain, LOCAL clones go to the local image.  Any other
+ * target is a bug.
+ */
+static void
+llpcache_forward_write(td_llpcache_t *s, td_request_t treq)
+{
+       const td_vbd_request_t *vreq = treq.vreq;
+       struct llpcache_vreq *lvr;
+
+       lvr = containerof(vreq, struct llpcache_vreq, vreq);
+
+       if (lvr->target == SHARED)
+               td_forward_request(treq);
+       else if (lvr->target == LOCAL)
+               td_queue_write(s->local, treq);
+       else
+               BUG();
+}
+
+/*
+ * Write entry point.  A request whose vreq token is this driver's own
+ * state is one of our clones and gets routed to its target.  Anything
+ * else is an original write that still has to be forked.
+ */
+static void
+llpcache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       td_llpcache_t *s = driver->data;
+
+       if (treq.vreq->token != s) {
+               llpcache_fork_write(s, treq);
+               return;
+       }
+
+       llpcache_forward_write(s, treq);
+}
+
+/*
+ * Read entry point: in LLP_MIRROR mode reads are served by the LOCAL
+ * image, in LLP_SHARED mode they are forwarded down the chain.  Any
+ * other mode is a bug.
+ */
+static void
+llpcache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       td_llpcache_t *s = driver->data;
+
+       switch (s->mode) {
+       case LLP_MIRROR:
+               td_queue_read(s->local, treq);
+               break;
+       case LLP_SHARED:
+               td_forward_request(treq);
+               /* Without this break, every LLP_SHARED read hit BUG(). */
+               break;
+       default:
+               BUG();
+       }
+}
+
+/* Close the LOCAL image if it is open.  Always returns 0. */
+static int
+llpcache_close(td_driver_t *driver, struct tqh_td_image_handle *head)
+{
+       td_llpcache_t *s = driver->data;
+
+       if (!s->local)
+               return 0;
+
+       tapdisk_image_close(s->local, head);
+       s->local = NULL;
+
+       return 0;
+}
+
+/*
+ * Open the LLP cache: start in LLP_MIRROR mode, put every request slot
+ * on the free stack and open @name as the LOCAL VHD image, whose
+ * driver info becomes this driver's info.  On failure
+ * llpcache_close() is used for cleanup and the error is returned.
+ */
+static int
+llpcache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       td_llpcache_t *s = driver->data;
+       int idx, err;
+
+       s->mode = LLP_MIRROR;
+
+       for (idx = 0; idx < TD_LLPCACHE_MAX_REQ; idx++)
+               llpcache_free_request(s, s->reqv + idx);
+
+       err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->local);
+       if (err) {
+               llpcache_close(driver, NULL);
+               return err;
+       }
+
+       driver->info = s->local->driver->info;
+
+       return 0;
+}
+
+/*
+ * Report the parent of the aggregated image and clear TD_OPEN_RDONLY
+ * from its flags on success.
+ *
+ * NOTE(review): this handler is installed in both the LLP and the LLE
+ * driver tables but always reads driver->data as a td_llpcache_t.  For
+ * LLE that appears to rely on 'shared' occupying the same position as
+ * 'local' -- confirm.
+ */
+static int
+llcache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+       td_llpcache_t *s = driver->data;
+       int err = td_get_parent_id(s->local, id);
+
+       if (err)
+               return err;
+
+       id->flags &= ~TD_OPEN_RDONLY;
+       return 0;
+}
+
+/* Parent validation is not implemented: always returns -ENOSYS. */
+static int
+llcache_validate_parent(td_driver_t *driver __attribute__((unused)),
+                       td_driver_t *pdriver __attribute__((unused)),
+                       td_flag_t flags __attribute__((unused)))
+{
+       const int status = -ENOSYS;
+
+       return status;
+}
+
+
+/*
+ * Driver table for the LLP cache.  The parent-id and validate-parent
+ * handlers are the llcache_* functions that the LLE table also uses.
+ */
+struct tap_disk tapdisk_llpcache = {
+       .disk_type                  = "tapdisk_llpcache",
+       .flags                      = 0,
+       .private_data_size          = sizeof(td_llpcache_t),
+       .td_open                    = llpcache_open,
+       .td_close                   = llpcache_close,
+       .td_queue_read              = llpcache_queue_read,
+       .td_queue_write             = llpcache_queue_write,
+       .td_get_parent_id           = llcache_get_parent_id,
+       .td_validate_parent         = llcache_validate_parent,
+};
+
+/*
+ * LLE: Local Leaf Ephemeral Cache
+ *      -- Non-persistent write caching in local storage.
+ *
+ *    VBD
+ *      \
+ *       +--r/w--> lle+vhd:/shared/leaf
+ *        \
+ *         +--r/w--> vhd:/local/leaf
+ *          \
+ *           +--r/o--> vhd:/shared/parent
+ *
+ * Note that LOCAL and SHARED chain order differs from LLP. Shared
+ * storage data masks local data.
+ *
+ * This means the VDI state held in shared storage alone is
+ * inconsistent. Whenever LOCAL is unavailable, SHARED must be
+ * discarded too.
+ */
+enum {
+       LLE_LOCAL = 1,
+       /*
+        * LLE_LOCAL:
+        *
+        * Writes are forwarded to LOCAL only. As are reads. This
+        * reduces network overhead.
+        *
+        * Failure to write LOCAL is recoverable. The driver will
+        * transition to LLE_SHARED.
+        *
+        * Failure to write to SHARED is irrecoverable and signaled
+        * to the original issuer.
+        */
+
+       LLE_SHARED = 2,
+       /*
+        * LLE_SHARED:
+        *
+        * Writes are issued to SHARED. As are reads.
+        *
+        * Failure to write to SHARED is irrecoverable.
+        */
+};
+
+typedef struct llecache                 td_llecache_t;
+typedef struct llecache_request         td_llecache_req_t;
+#define TD_LLECACHE_MAX_REQ             (MAX_REQUESTS*2)
+
+/* Bookkeeping for one write forwarded to LOCAL in LLE_LOCAL mode. */
+struct llecache_request {
+       /* Owning driver state. */
+       td_llecache_t          *s;
+       /* Copy of the original write request. */
+       td_request_t            treq;
+       /* Sectors of the forwarded write still outstanding. */
+       int                     pending;
+       /* Accumulated write status (see ll_write_error()). */
+       int                     error;
+};
+
+/* Per-driver state of the LLE (local leaf ephemeral) cache. */
+struct llecache {
+       /* SHARED image, opened in llecache_open(). */
+       td_image_t             *shared;
+       /* LLE_LOCAL or LLE_SHARED. */
+       int                     mode;
+
+       /* Request slots, and a stack of pointers to the free ones. */
+       td_llecache_req_t       reqv[TD_LLECACHE_MAX_REQ];
+       td_llecache_req_t      *free[TD_LLECACHE_MAX_REQ];
+       int                     n_free;
+};
+
+/* Pop a free request slot, or return NULL when none is available. */
+static td_llecache_req_t *
+llecache_alloc_request(td_llecache_t *s)
+{
+       if (unlikely(!s->n_free))
+               return NULL;
+
+       s->n_free--;
+       return s->free[s->n_free];
+}
+
+/* Push @req back on the free stack; panics if the stack is already full. */
+static void
+llecache_free_request(td_llecache_t *s, td_llecache_req_t *req)
+{
+       BUG_ON(s->n_free >= TD_LLECACHE_MAX_REQ);
+
+       s->free[s->n_free] = req;
+       s->n_free++;
+}
+
+/* Close the SHARED image if it is open.  Always returns 0. */
+static int
+llecache_close(td_driver_t *driver, struct tqh_td_image_handle *head)
+{
+       td_llecache_t *s = driver->data;
+
+       if (!s->shared)
+               return 0;
+
+       tapdisk_image_close(s->shared, head);
+       s->shared = NULL;
+
+       return 0;
+}
+
+/*
+ * Open the LLE cache: start in LLE_LOCAL mode, put every request slot
+ * on the free stack and open @name as the SHARED VHD image, whose
+ * driver info becomes this driver's info.  On failure
+ * llecache_close() is used for cleanup and the error is returned.
+ */
+static int
+llecache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       td_llecache_t *s = driver->data;
+       int idx, err;
+
+       s->mode = LLE_LOCAL;
+
+       for (idx = 0; idx < TD_LLECACHE_MAX_REQ; idx++)
+               llecache_free_request(s, s->reqv + idx);
+
+       err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->shared);
+       if (err) {
+               llecache_close(driver, NULL);
+               return err;
+       }
+
+       driver->info = s->shared->driver->info;
+
+       return 0;
+}
+
+/*
+ * Callback for (possibly partial) completions of a write that was
+ * forwarded in LLE_LOCAL mode.
+ *
+ * Each call subtracts treq.secs from the outstanding count and folds
+ * the status into req->error via ll_write_error().  Once nothing is
+ * outstanding: an accumulated -ENOSPC is logged, switches the driver
+ * to LLE_SHARED and re-issues the original write to the SHARED image;
+ * any other outcome completes the original request.  The slot is
+ * released in both cases.
+ */
+static void
+__llecache_write_cb(td_request_t treq, int error)
+{
+       td_llecache_req_t *req = treq.cb_data;
+       td_llecache_t *s = req->s;
+
+       BUG_ON(req->pending < treq.secs);
+
+       req->pending -= treq.secs;
+       req->error    = ll_write_error(req->error, error);
+
+       /* More partial completions are still expected. */
+       if (req->pending)
+               return;
+
+       if (req->error == -ENOSPC) {
+               ll_log_switch(DISK_TYPE_LLECACHE, req->error,
+                             treq.image, s->shared);
+
+               s->mode = LLE_SHARED;
+               /* req->treq is passed by value, so freeing req below is safe. */
+               td_queue_write(s->shared, req->treq);
+
+       } else
+               /*
+                * NOTE(review): this passes the status of the last
+                * partial completion rather than the accumulated
+                * req->error -- confirm that is intended.
+                */
+               td_complete_request(req->treq, error);
+
+       llecache_free_request(s, req);
+}
+
+/*
+ * Forward a write down the chain in LLE_LOCAL mode, tracking it in a
+ * request slot so that __llecache_write_cb() can redirect it to SHARED
+ * on -ENOSPC.  With no free slot the request completes with -EBUSY.
+ */
+static void
+llecache_forward_write(td_llecache_t *s, td_request_t treq)
+{
+       td_llecache_req_t *req;
+       td_request_t clone;
+
+       req = llecache_alloc_request(s);
+       if (!req) {
+               td_complete_request(treq, -EBUSY);
+               return;
+       }
+
+       /*
+        * Clear the whole slot.  sizeof(req) would only cover the size
+        * of the pointer and could leave a stale req->error from the
+        * slot's previous use in place.
+        */
+       memset(req, 0, sizeof(*req));
+
+       req->treq       = treq;
+       req->pending    = treq.secs;
+       req->s          = s;
+
+       clone           = treq;
+       clone.cb        = __llecache_write_cb;
+       clone.cb_data   = req;
+
+       td_forward_request(clone);
+}
+
+/*
+ * Write entry point: in LLE_LOCAL mode the write is tracked and
+ * forwarded down the chain, in LLE_SHARED mode it goes straight to the
+ * SHARED image.  Any other mode is a bug, matching
+ * llecache_queue_read(); previously such a request was silently
+ * dropped without ever being completed.
+ */
+static void
+llecache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       td_llecache_t *s = driver->data;
+
+       switch (s->mode) {
+       case LLE_LOCAL:
+               llecache_forward_write(s, treq);
+               break;
+       case LLE_SHARED:
+               td_queue_write(s->shared, treq);
+               break;
+       default:
+               BUG();
+       }
+}
+
+/*
+ * Read entry point: in LLE_LOCAL mode reads are forwarded down the
+ * chain, in LLE_SHARED mode they are served by the SHARED image.  Any
+ * other mode is a bug.
+ */
+static void
+llecache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       td_llecache_t *s = driver->data;
+
+       if (s->mode == LLE_LOCAL)
+               td_forward_request(treq);
+       else if (s->mode == LLE_SHARED)
+               td_queue_read(s->shared, treq);
+       else
+               BUG();
+}
+
+/*
+ * Driver table for the LLE cache.  The parent-id and validate-parent
+ * handlers are the llcache_* functions that the LLP table also uses.
+ */
+struct tap_disk tapdisk_llecache = {
+       .disk_type                  = "tapdisk_llecache",
+       .flags                      = 0,
+       .private_data_size          = sizeof(td_llecache_t),
+       .td_open                    = llecache_open,
+       .td_close                   = llecache_close,
+       .td_queue_read              = llecache_queue_read,
+       .td_queue_write             = llecache_queue_write,
+       .td_get_parent_id           = llcache_get_parent_id,
+       .td_validate_parent         = llcache_validate_parent,
+};
diff --git a/tools/blktap3/drivers/block-log.c b/tools/blktap3/drivers/block-log.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/block-log.c
@@ -0,0 +1,692 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver to sit on top of another disk and log writes, in order
+ * to synchronize two distinct disks
+ *
+ * On receipt of a control request it can export a list of dirty
+ * sectors in the following format:
+ * struct writerange {
+ *   u64 sector;
+ *   u32 count;
+ * }
+ * terminated by { 0, 0 }
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "log.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+#define MAX_CONNECTIONS 1
+
+/* A file descriptor together with the event registered to poll it. */
+typedef struct poll_fd {
+  int          fd;   /* socket descriptor; set to -1 once closed */
+  event_id_t   id;   /* id returned by tapdisk_server_register_event() */
+} poll_fd_t;
+
+/* Per-instance state of the write-logging driver. */
+struct tdlog_state {
+  /* taken from driver->info.size; logged as the sector count */
+  uint64_t     size;
+
+  /* dirty bitmap, one bit per sector, allocated by writelog_create() */
+  void*        writelog;
+
+  /* path and listening socket of the control channel (see ctl_open()) */
+  char*        ctlpath;
+  poll_fd_t    ctl;
+
+  /* number of entries in use in connections[] */
+  int          connected;
+  poll_fd_t    connections[MAX_CONNECTIONS];
+
+  /* name and mapping of the shared memory region (see shmem_open()) */
+  char*        shmpath;
+  void*        shm;
+
+  /* request/response ring located inside the shared memory region */
+  log_sring_t* sring;
+  log_back_ring_t bring;
+};
+
+/* Debug-level syslog message, prefixed with "log: " and newline-terminated. */
+#define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a)
+
+/* Warning-level syslog message, prefixed with "log: " and newline-terminated. */
+#define BWPRINTF(_f, _a...) syslog (LOG_WARNING, "log: " _f "\n", ## _a)
+
+/* Event callbacks for the listening socket and for accepted connections. */
+static void ctl_accept(event_id_t, char, void *);
+static void ctl_request(event_id_t, char, void *);
+
+/* -- write log -- */
+
+/* large flat bitmaps don't scale particularly well either in size or scan
+ * time, but they'll do for now */
+/* number of bits in one bitmap word (unsigned long) */
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+/* number of words needed to hold the given number of bits, rounded up */
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+
+/* the word of _bmap that contains bit _nr */
+#define BITMAP_ENTRY(_nr, _bmap) ((unsigned long*)(_bmap))[(_nr)/BITS_PER_LONG]
+/* position of bit _nr inside its word */
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/*
+ * Return 1 if bit nr of bmap is set, 0 otherwise.
+ *
+ * nr is 64 bits wide because callers pass sector numbers held in uint64_t;
+ * an int parameter would truncate indices of 2^31 and above.
+ */
+static inline int test_bit(uint64_t nr, void* bmap)
+{
+  return (BITMAP_ENTRY(nr, bmap) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+/*
+ * Clear bit nr of bmap.
+ *
+ * nr is 64 bits wide because callers pass sector numbers held in uint64_t;
+ * an int parameter would truncate indices of 2^31 and above.
+ */
+static inline void clear_bit(uint64_t nr, void* bmap)
+{
+  BITMAP_ENTRY(nr, bmap) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+/*
+ * Set bit nr of bmap.
+ *
+ * nr is 64 bits wide because callers pass sector numbers held in uint64_t;
+ * an int parameter would truncate indices of 2^31 and above.
+ */
+static inline void set_bit(uint64_t nr, void* bmap)
+{
+  BITMAP_ENTRY(nr, bmap) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/*
+ * Size in bytes of a bitmap able to hold sz bits.
+ *
+ * The bitmap is accessed one unsigned long at a time, so the size is rounded
+ * up to a whole number of words. Rounding down (sz >> 3) would leave the
+ * last word partly outside the allocation whenever sz is not a multiple of
+ * BITS_PER_LONG.
+ */
+static inline int bitmap_size(uint64_t sz)
+{
+  return BITS_TO_LONGS(sz) * sizeof(unsigned long);
+}
+
+/*
+ * Allocate the zero-filled dirty bitmap for s->size bits.
+ * Returns 0 on success, -1 if the allocation fails.
+ */
+static int writelog_create(struct tdlog_state *s)
+{
+  const uint64_t nbytes = bitmap_size(s->size);
+
+  BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", nbytes);
+
+  s->writelog = calloc(nbytes, 1);
+  if (s->writelog)
+    return 0;
+
+  BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, nbytes);
+  return -1;
+}
+
+/*
+ * Release the dirty bitmap. Always returns 0.
+ *
+ * The pointer is reset so that a repeated call cannot free it twice;
+ * free(NULL) is a no-op, so no guard is needed.
+ */
+static int writelog_free(struct tdlog_state *s)
+{
+  free(s->writelog);
+  s->writelog = NULL;
+
+  return 0;
+}
+
+/*
+ * Mark count consecutive bits starting at sector as dirty.
+ * Always returns 0; a count of zero or less marks nothing.
+ */
+static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
+{
+  uint64_t bit = sector;
+  int remaining;
+
+  for (remaining = count; remaining > 0; remaining--)
+    set_bit(bit++, s->writelog);
+
+  return 0;
+}
+
+/*
+ * Clear the dirty bits in [start, end). If end is 0, or beyond the end of the
+ * bitmap, clear up to s->size. Always returns 0.
+ *
+ * Bits in partial words at either edge are cleared one at a time; the
+ * word-aligned middle is cleared with a single memset.
+ */
+int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
+{
+  if (!end || end > s->size)
+    end = s->size;
+
+  /* leading partial word; stop early if the whole range sits in one word */
+  while (start < end && BITMAP_SHIFT(start))
+    clear_bit(start++, s->writelog);
+
+  /* trailing partial word; end is exclusive, so step back before clearing */
+  while (end > start && BITMAP_SHIFT(end))
+    clear_bit(--end, s->writelog);
+
+  /* start and end are now word-aligned (or equal): offsets are whole bytes */
+  if (end > start)
+    memset((char*)s->writelog + (start >> 3), 0, (end - start) >> 3);
+
+  return 0;
+}
+
+/*
+ * Write the dirty extents of the bitmap into the shared memory region as a
+ * list of { sector, count } ranges.
+ *
+ * Returns the index at which the scan stopped: s->size after a full scan
+ * (the list is then terminated by a { 0, 0 } entry), or an earlier index if
+ * the shared memory region filled up first (no terminator is written then).
+ */
+static uint64_t writelog_export(struct tdlog_state* s)
+{
+  struct disk_range* out = s->shm;
+  uint64_t sec = 0;
+
+  BDPRINTF("sector count: %"PRIu64, s->size);
+
+  while (sec < s->size) {
+    if (!test_bit(sec, s->writelog)) {
+      sec++;
+      continue;
+    }
+
+    /* start of a dirty extent: extend it over every following dirty bit */
+    out->sector = sec;
+    out->count = 1;
+    for (sec++; sec < s->size && test_bit(sec, s->writelog); sec++)
+      out->count++;
+
+    BDPRINTF("export: dirty extent %"PRIu64":%u",
+            out->sector, out->count);
+    out++;
+
+    /* out of space in shared memory region */
+    if ((void*)out >= bmend(s->shm)) {
+      BDPRINTF("out of space in shm region at sector %"PRIu64, sec);
+      return sec;
+    }
+
+    /* sec now names the first bit after the extent; the loop re-tests it */
+  }
+
+  /* NULL-terminate range list */
+  out->sector = 0;
+  out->count = 0;
+
+  return sec;
+}
+
+/* -- communication channel -- */
+
+/*
+ * Replace every ':' and '/' with '_' in path, looking at no more than len
+ * bytes and stopping at the terminating NUL.
+ */
+static inline void path_escape(char* path, size_t len) {
+  size_t pos = 0;
+
+  while (pos < len && path[pos] != '\0') {
+    if (path[pos] == ':' || path[pos] == '/')
+      path[pos] = '_';
+    pos++;
+  }
+}
+
+/*
+ * Build "<BLKTAP_CTRL_DIR>/log_<file>.<ext>", where <file> is the tail of
+ * name starting at its last '/'. That tail is then passed through
+ * path_escape(), so the leading '/' becomes '_'.
+ *
+ * Returns a string allocated by asprintf(), or NULL if name contains no '/'
+ * or the allocation fails.
+ */
+static char* ctl_makepath(const char* name, const char* ext)
+{
+  char *path;
+  char *tail = strrchr(name, '/');
+
+  if (tail == NULL) {
+    BWPRINTF("invalid name %s\n", name);
+    return NULL;
+  }
+
+  if (asprintf(&path, BLKTAP_CTRL_DIR "/log_%s.%s", tail, ext) < 0) {
+    BWPRINTF("could not allocate path");
+    return NULL;
+  }
+
+  /* skip the directory and the 5 characters of "/log_" */
+  path_escape(path + strlen(BLKTAP_CTRL_DIR) + 5, strlen(tail));
+
+  return path;
+}
+
+/*
+ * Create and map the shared memory region used to export the write log.
+ *
+ * The object is named "/log_<name>.wlog", with ':' and '/' in <name>
+ * replaced by '_'. On success s->shmpath and s->shm are set and 0 is
+ * returned. On failure both are NULL, an object that was already created is
+ * unlinked again, and -1 is returned.
+ */
+static int shmem_open(struct tdlog_state* s, const char* name)
+{
+  int fd, saved_errno;
+
+  /* device name -> path */
+  if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) {
+    BWPRINTF("could not allocate shm path");
+    /* the pointer is indeterminate after a failed asprintf */
+    s->shmpath = NULL;
+    return -1;
+  }
+
+  /* skip the 5 characters of "/log_" */
+  path_escape(s->shmpath + 5, strlen(name));
+
+  if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) {
+    BWPRINTF("could not open shared memory file %s: %s", s->shmpath,
+            strerror(errno));
+    goto err;
+  }
+  if (ftruncate(fd, SHMSIZE) < 0) {
+    BWPRINTF("error truncating shmem to size %u", SHMSIZE);
+    close(fd);
+    goto err_unlink;
+  }
+
+  s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+  /* close() may overwrite errno, so keep mmap's value for the message */
+  saved_errno = errno;
+  close(fd);
+  if (s->shm == MAP_FAILED) {
+    BWPRINTF("could not mmap write log shm: %s", strerror(saved_errno));
+    goto err_unlink;
+  }
+  return 0;
+
+  err_unlink:
+  shm_unlink(s->shmpath);
+  err:
+  s->shm = NULL;
+  free(s->shmpath);
+  s->shmpath = NULL;
+  return -1;
+}
+
+/*
+ * Unmap the shared memory region, remove its name and free the path string.
+ * Safe to call when nothing was opened. Always returns 0.
+ */
+static int shmem_close(struct tdlog_state* s)
+{
+  if (s->shm) {
+    munmap(s->shm, SHMSIZE);
+    s->shm = NULL;
+  }
+
+  if (s->shmpath) {
+    shm_unlink(s->shmpath);
+    /* allocated by asprintf() in shmem_open() */
+    free(s->shmpath);
+    s->shmpath = NULL;
+  }
+
+  return 0;
+}
+
+/* control socket */
+
+/*
+ * Create the UNIX-domain control socket for this device, start listening on
+ * it and register ctl_accept() as its read handler.
+ *
+ * Returns 0 on success. On failure returns -1 with s->ctl.fd set to -1 and
+ * s->ctlpath set to NULL; a socket file that was already bound is removed.
+ */
+static int ctl_open(struct tdlog_state* s, const char* name)
+{
+  struct sockaddr_un saddr;
+  size_t pathlen;
+
+  /* ctl_close() treats any fd >= 0 as open, so never leave a stale value */
+  s->ctl.fd = -1;
+
+  if (!(s->ctlpath = ctl_makepath(name, "ctl")))
+    return -1;
+
+  /* sun_path is a fixed-size array; leave room for the terminating NUL */
+  pathlen = strlen(s->ctlpath);
+  if (pathlen >= sizeof(saddr.sun_path)) {
+    BWPRINTF("control socket path %s is too long", s->ctlpath);
+    goto err;
+  }
+
+  if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+    BWPRINTF("error opening control socket: %s", strerror(errno));
+    goto err;
+  }
+
+  memset(&saddr, 0, sizeof(saddr));
+  saddr.sun_family = AF_UNIX;
+  memcpy(saddr.sun_path, s->ctlpath, pathlen);
+  if (unlink(s->ctlpath) && errno != ENOENT) {
+    BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
+            strerror(errno));
+    goto err_sock;
+  }
+
+  if (bind(s->ctl.fd, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+    BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
+            strerror(errno));
+    goto err_sock;
+  }
+
+  if (listen(s->ctl.fd, 1) < 0) {
+    BWPRINTF("error listening on control socket: %s", strerror(errno));
+    goto err_unlink;
+  }
+
+  s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                           s->ctl.fd, 0, ctl_accept, s);
+  if (s->ctl.id < 0) {
+    /* NOTE(review): assumes the negative id is a negated errno - confirm */
+    BWPRINTF("error register event handler: %s", strerror(-s->ctl.id));
+    goto err_unlink;
+  }
+
+  return 0;
+
+  err_unlink:
+  /* bind() created the socket file; do not leave it behind */
+  unlink(s->ctlpath);
+  err_sock:
+  close(s->ctl.fd);
+  s->ctl.fd = -1;
+  err:
+  free(s->ctlpath);
+  s->ctlpath = NULL;
+
+  return -1;
+}
+
+/*
+ * Tear down the control channel: close every accepted connection, close the
+ * listening socket, remove the socket file and reinitialise the ring.
+ * Always returns 0.
+ */
+static int ctl_close(struct tdlog_state* s)
+{
+  /* entries in use are connections[0 .. connected-1]; walk them backwards */
+  while (s->connected > 0) {
+    poll_fd_t *conn = &s->connections[--s->connected];
+
+    tapdisk_server_unregister_event(conn->id);
+    close(conn->fd);
+    conn->fd = -1;
+    conn->id = 0;
+  }
+
+  if (s->ctl.fd >= 0) {
+    tapdisk_server_unregister_event(s->ctl.id);
+    close(s->ctl.fd);
+    s->ctl.fd = -1;
+    s->ctl.id = 0;
+  }
+
+  if (s->ctlpath) {
+    unlink(s->ctlpath);
+    free(s->ctlpath);
+    s->ctlpath = NULL;
+  }
+
+  /* XXX this must be fixed once requests are actually in flight */
+  /* could just drain the existing ring here first */
+  if (s->sring) {
+    SHARED_RING_INIT(s->sring);
+    BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+  }
+
+  return 0;
+}
+
+/*
+ * Walk the list of open connections and close the one whose descriptor is
+ * fd. Returns 0 if it was found and closed, -1 otherwise.
+ *
+ * Only connections[0 .. connected-1] are in use. The last used entry is
+ * moved into the freed slot so the used entries stay contiguous.
+ */
+static int ctl_close_sock(struct tdlog_state* s, int fd)
+{
+  int i;
+
+  for (i = 0; i < s->connected; i++) {
+    if (s->connections[i].fd != fd)
+      continue;
+
+    tapdisk_server_unregister_event(s->connections[i].id);
+    close(s->connections[i].fd);
+
+    s->connected--;
+    s->connections[i] = s->connections[s->connected];
+    s->connections[s->connected].fd = -1;
+    s->connections[s->connected].id = 0;
+    return 0;
+  }
+
+  BWPRINTF("requested to close unknown socket %d", fd);
+  return -1;
+}
+
+/*
+ * Read handler of the listening control socket: accept one connection and
+ * register ctl_request() as its read handler. A connection arriving while
+ * all MAX_CONNECTIONS slots are taken is closed immediately.
+ */
+static void ctl_accept(event_id_t id, char mode, void *private)
+{
+  struct tdlog_state* s = (struct tdlog_state *)private;
+  int fd;
+  event_id_t cid;
+
+  if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) {
+    BWPRINTF("error accepting control connection: %s", strerror(errno));
+    return;
+  }
+
+  /* never index past the end of connections[] */
+  if (s->connected >= MAX_CONNECTIONS) {
+    BWPRINTF("control session in progress, closing new connection");
+    close(fd);
+    return;
+  }
+
+  cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                     fd, 0, ctl_request, s);
+  if (cid < 0) {
+    /* NOTE(review): assumes the negative id is a negated errno - confirm */
+    BWPRINTF("error registering connection event handler: %s", strerror(-cid));
+    close(fd);
+    return;
+  }
+
+  s->connections[s->connected].fd = fd;
+  s->connections[s->connected].id = cid;
+  s->connected++;
+}
+
+/*
+ * Send the shared memory parameters to the client.
+ *
+ * response format: 4 bytes shmsize, 0-terminated path, CTLRSPLEN_SHMP bytes
+ * in total. Returns 0 on success, -1 if the write fails.
+ */
+static int ctl_get_shmpath(struct tdlog_state* s, int fd)
+{
+  char msg[CTLRSPLEN_SHMP + 1];
+  uint32_t sz;
+  int rc;
+
+  BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)",
+          SHMSIZE, s->shmpath);
+
+  /* TMP: sanity-check shm */
+  sz = 0xdeadbeef;
+  memcpy(s->shm, &sz, sizeof(sz));
+
+  /* the whole buffer is sent: do not leak stack contents after the path */
+  memset(msg, 0, sizeof(msg));
+
+  sz = SHMSIZE;
+  memcpy(msg, &sz, sizeof(sz));
+  snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath);
+  if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) {
+    BWPRINTF("error writing shmpath: %s", strerror(errno));
+    return -1;
+  }
+
+  return 0;
+}
+
+/*
+ * Export the dirty bitmap to shared memory without clearing it, then
+ * acknowledge with "done". Returns 0 on success, -1 if the write fails.
+ */
+static int ctl_peek_writes(struct tdlog_state* s, int fd)
+{
+  BDPRINTF("ctl: peeking bitmap");
+
+  writelog_export(s);
+
+  if (write(fd, "done", CTLRSPLEN_PEEK) >= 0)
+    return 0;
+
+  BWPRINTF("error writing peek ack: %s", strerror(errno));
+  return -1;
+}
+
+/*
+ * Clear the whole dirty bitmap, then acknowledge with "done".
+ * Returns 0 on success, -1 if the write fails.
+ */
+static int ctl_clear_writes(struct tdlog_state* s, int fd)
+{
+  BDPRINTF("ctl: clearing bitmap");
+
+  writelog_clear(s, 0, 0);
+
+  if (write(fd, "done", CTLRSPLEN_CLEAR) >= 0)
+    return 0;
+
+  BWPRINTF("error writing clear ack: %s", strerror(errno));
+  return -1;
+}
+
+/*
+ * Export the dirty bitmap and then clear it in one step, acknowledging with
+ * "done". Returns 0 on success, -1 if the write fails.
+ */
+static int ctl_get_writes(struct tdlog_state* s, int fd)
+{
+  BDPRINTF("ctl: getting bitmap");
+
+  writelog_export(s);
+  writelog_clear(s, 0, 0);
+
+  if (write(fd, "done", CTLRSPLEN_GET) >= 0)
+    return 0;
+
+  BWPRINTF("error writing get ack: %s", strerror(errno));
+  return -1;
+}
+
+/*
+ * Consume every request currently on the ring, echo each back as a response
+ * with the same sector and count, push the responses and notify the client
+ * with a LOGCMD_KICK message.
+ *
+ * Returns 0 on success, -1 if the notification cannot be written in full.
+ */
+static int ctl_kick(struct tdlog_state* s, int fd)
+{
+  RING_IDX reqstart, reqend;
+  log_request_t req;
+  log_response_t rsp;
+  struct log_ctlmsg msg;
+  int rc;
+
+  reqstart = s->bring.req_cons;
+  reqend = s->sring->req_prod;
+
+  BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend);
+
+  while (reqstart != reqend) {
+    /* XXX actually submit these! */
+    memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req));
+    BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count);
+    s->bring.req_cons = ++reqstart;
+
+    rsp.sector = req.sector;
+    rsp.count = req.count;
+    memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp,
+          sizeof(rsp));
+    s->bring.rsp_prod_pvt++;
+  }
+
+  RING_PUSH_RESPONSES(&s->bring);
+  memset(&msg, 0, sizeof(msg));
+  memcpy(msg.msg, LOGCMD_KICK, 4);
+  if ((rc = write(fd, &msg, sizeof(msg))) < 0) {
+    BWPRINTF("error sending notify: %s", strerror(errno));
+    return -1;
+  } else if ((size_t)rc < sizeof(msg)) {
+    /* sizeof yields a size_t, which is printed with %zu */
+    BWPRINTF("short notify write (%d/%zu)", rc, sizeof(msg));
+    return -1;
+  }
+
+  return 0;
+}
+
+/*
+ * Dispatch one control message to its handler, selected by comparing the
+ * first 4 bytes of msg->msg against the LOGCMD_* commands.
+ *
+ * Returns the handler's result, or -1 for an unknown command.
+ */
+static int ctl_do_request(struct tdlog_state* s, int fd,
+                          struct log_ctlmsg* msg)
+{
+  if (!strncmp(msg->msg, LOGCMD_SHMP, 4)) {
+    return ctl_get_shmpath(s, fd);
+  } else if (!strncmp(msg->msg, LOGCMD_PEEK, 4)) {
+    return ctl_peek_writes(s, fd);
+  } else if (!strncmp(msg->msg, LOGCMD_CLEAR, 4)) {
+    return ctl_clear_writes(s, fd);
+  } else if (!strncmp(msg->msg, LOGCMD_GET, 4)) {
+    return ctl_get_writes(s, fd);
+  } else if (!strncmp(msg->msg, LOGCMD_KICK, 4)) {
+    return ctl_kick(s, fd);
+  }
+
+  BWPRINTF("unknown control request %.4s", msg->msg);
+  return -1;
+}
+
+/*
+ * Look up the connection registered under event id and return its file
+ * descriptor, or -1 (after logging a warning) if no connection in use has
+ * that id.
+ */
+static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id)
+{
+  int slot = 0;
+
+  while (slot < s->connected) {
+    if (s->connections[slot].id == id)
+      return s->connections[slot].fd;
+    slot++;
+  }
+
+  BWPRINTF("unrecognized event callback id %d", id);
+  return -1;
+}
+
+/*
+ * Read handler of an accepted control connection: read one log_ctlmsg and
+ * dispatch it. The connection is closed on a read error or EOF; a short
+ * message is logged and ignored.
+ */
+static void ctl_request(event_id_t id, char mode, void *private)
+{
+  struct tdlog_state* s = (struct tdlog_state*)private;
+  struct log_ctlmsg msg;
+  int rc, fd;
+
+  fd = ctl_find_connection(s, id);
+  if (fd == -1)
+    return;
+
+  if ((rc = read(fd, &msg, sizeof(msg))) < 0) {
+    BWPRINTF("error reading from ctl socket %d, closing: %s", fd,
+            strerror(errno));
+    ctl_close_sock(s, fd);
+    return;
+  } else if (rc == 0) {
+    BDPRINTF("ctl_request: EOF, closing socket");
+    ctl_close_sock(s, fd);
+    return;
+  } else if ((size_t)rc < sizeof(msg)) {
+    /* sizeof yields a size_t, which is printed with %zu */
+    BWPRINTF("short request received (%d/%zu bytes), ignoring", rc,
+            sizeof(msg));
+    return;
+  }
+
+  ctl_do_request(s, fd, &msg);
+}
+
+/* -- interface -- */
+
+static int tdlog_close(td_driver_t*);
+
+/*
+ * Open the logging driver: allocate the dirty bitmap, create the shared
+ * memory region and the control socket, then initialise the ring inside the
+ * shared memory.
+ *
+ * Returns 0 on success. If any step fails, everything set up so far is torn
+ * down through tdlog_close() and that step's error code is returned.
+ */
+static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags)
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+  int rc, i;
+
+  memset(s, 0, sizeof(*s));
+
+  /*
+   * memset leaves every descriptor at 0, which is a valid fd. Mark them
+   * unused so an early tdlog_close() does not close descriptor 0.
+   */
+  s->ctl.fd = -1;
+  for (i = 0; i < MAX_CONNECTIONS; i++)
+    s->connections[i].fd = -1;
+
+  s->size = driver->info.size;
+
+  if ((rc = writelog_create(s))) {
+    tdlog_close(driver);
+    return rc;
+  }
+  if ((rc = shmem_open(s, name))) {
+    tdlog_close(driver);
+    return rc;
+  }
+  if ((rc = ctl_open(s, name))) {
+    tdlog_close(driver);
+    return rc;
+  }
+
+  s->sring = (log_sring_t*)sringstart(s->shm);
+  SHARED_RING_INIT(s->sring);
+  BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+
+  BDPRINTF("opened ctl socket");
+
+  return 0;
+}
+
+/*
+ * Close the logging driver: control channel first, then the shared memory
+ * region, then the dirty bitmap. Always returns 0.
+ */
+static int tdlog_close(td_driver_t* driver)
+{
+  struct tdlog_state* log = driver->data;
+
+  /* ctl_close() reinitialises the ring, so it runs before the unmap */
+  ctl_close(log);
+  shmem_close(log);
+  writelog_free(log);
+
+  return 0;
+}
+
+/* Reads are not logged: pass the request straight on. */
+static void tdlog_queue_read(td_driver_t* driver, td_request_t treq)
+{
+  (void)driver;
+
+  td_forward_request(treq);
+}
+
+/*
+ * Record the sectors touched by a write in the dirty bitmap, then forward
+ * the request.
+ */
+static void tdlog_queue_write(td_driver_t* driver, td_request_t treq)
+{
+  struct tdlog_state* s = (struct tdlog_state*)driver->data;
+
+  writelog_set(s, treq.sec, treq.secs);
+  td_forward_request(treq);
+}
+
+/* This driver never fills in a parent id: always returns -EINVAL. */
+static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id)
+{
+  (void)driver;
+  (void)id;
+
+  return -EINVAL;
+}
+
+/* No parent checks are performed: always returns 0. */
+static int tdlog_validate_parent(td_driver_t *driver,
+                                td_driver_t *parent, td_flag_t flags)
+{
+  (void)driver;
+  (void)parent;
+  (void)flags;
+
+  return 0;
+}
+
+/* Driver descriptor for the write-logging driver. */
+struct tap_disk tapdisk_log = {
+  /* identification and per-instance state */
+  .disk_type          = "tapdisk_log",
+  .flags              = 0,
+  .private_data_size  = sizeof(struct tdlog_state),
+
+  /* lifecycle */
+  .td_open            = tdlog_open,
+  .td_close           = tdlog_close,
+
+  /* data path */
+  .td_queue_read      = tdlog_queue_read,
+  .td_queue_write     = tdlog_queue_write,
+
+  /* parent handling */
+  .td_get_parent_id   = tdlog_get_parent_id,
+  .td_validate_parent = tdlog_validate_parent,
+};
diff --git a/tools/blktap3/drivers/block-nbd.c b/tools/blktap3/drivers/block-nbd.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/block-nbd.c
@@ -0,0 +1,908 @@
+/*
+ * Copyright (c) 2012, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/types.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <netinet/tcp.h>
+#include <netinet/in.h>
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-fdreceiver.h"
+#include "tapdisk-nbd.h"
+
+#define INFO(_f, _a...)            tlog_syslog(TLOG_INFO, "nbd: " _f, ##_a)
+#define ERROR(_f, _a...)           tlog_syslog(TLOG_WARN, "nbd: " _f, ##_a)
+
+#define N_PASSED_FDS 10
+#define TAPDISK_NBDCLIENT_MAX_PATH_LEN 256
+#define TAPDISK_NBDCLIENT_LISTEN_SOCK_PATH "/var/run/blktap-control/nbdclient"
+#define MAX_NBD_REQS TAPDISK_DATA_REQUESTS
+#define NBD_TIMEOUT 30
+
+/*
+ * We'll only ever have one nbdclient fd receiver per tapdisk process, so
+ * let's just store it here globally. We'll also keep track of the passed fds
+ * here too.
+ */
+
+/* Set by tdnbd_fdreceiver_start(); NULL until then. */
+struct td_fdreceiver *fdreceiver = NULL;
+
+/*
+ * Table of file descriptors received but not yet claimed. A slot whose fd is
+ * -1 is free.
+ */
+struct tdnbd_passed_fd {
+       /* name the fd was passed under; copied with strncpy, so it may be
+        * unterminated when the name fills all 40 bytes */
+       char                    id[40];
+       /* time at which the fd was stashed (gettimeofday) */
+       struct                  timeval t;
+       int                     fd;
+} passed_fds[N_PASSED_FDS];
+
+/* Progress of one buffer being sent or received in several partial calls. */
+struct nbd_queued_io {
+       char                   *buffer;  /* start of the data */
+       int                     len;     /* total number of bytes to transfer */
+       int                     so_far;  /* bytes transferred so far */
+};
+
+/*
+ * this creates "struct tqh_td_nbd_request"
+ */
+TAILQ_HEAD(tqh_td_nbd_request, td_nbd_request);
+
+/* One NBD request together with the tapdisk request it was created for. */
+struct td_nbd_request {
+       /* tapdisk request, completed once the request finishes or is
+        * cancelled */
+       td_request_t            treq;
+       /* wire-format request header; multi-byte fields are stored in
+        * network byte order */
+       struct nbd_request      nreq;
+       /* id of the per-request timeout event, or -1 if there is none */
+       int                     timeout_event;
+       /* copied from the caller of tdnbd_queue_request() */
+       int                     fake;
+       /* send progress of nreq */
+       struct nbd_queued_io    header;
+       struct nbd_queued_io    body;     /* in or out, depending on whether
+                                            type is read or write. */
+       TAILQ_ENTRY(td_nbd_request) queue; /* TODO rename to entry */
+};
+
+/* Per-instance state of the NBD client driver. */
+struct tdnbd_data
+{
+       /* id of the socket write event; negative while it is not registered */
+       int                     writer_event_id;
+       /* requests fully written to the socket and awaiting a reply */
+       struct tqh_td_nbd_request        sent_reqs;
+       /* requests queued but not yet fully written */
+       struct tqh_td_nbd_request        pending_reqs;
+       /* unused request slots */
+       struct tqh_td_nbd_request        free_reqs;
+       struct td_nbd_request   requests[MAX_NBD_REQS];
+       /* number of requests on free_reqs */
+       int                     nr_free_count;
+
+       int                     reader_event_id;
+       /* reply header currently being received, and its receive progress */
+       struct nbd_reply        current_reply;
+       struct nbd_queued_io    cur_reply_qio;
+       /* request matching current_reply, or NULL if not yet looked up */
+       struct td_nbd_request  *curr_reply_req;
+
+       int                     socket;
+       struct sockaddr_in     *remote;
+       char                   *peer_ip;
+       int                     port;
+       char                   *name;
+
+       int                     flags;
+       /* set to 2 once a disconnect request has been sent and to 3 by
+        * tdnbd_disable() */
+       int                     closed;
+};
+
+/* Counter used to build the handle of each new request. */
+int global_id = 0;
+
+/* Defined after its first use in tdnbd_writer_cb(). */
+static void disable_write_queue(struct tdnbd_data *prv);
+
+
+/* -- fdreceiver bits and pieces -- */
+
+/*
+ * fdreceiver callback: remember a received fd under the name in msg, in the
+ * first free slot of passed_fds. If every slot is taken, the fd is closed.
+ */
+static void
+tdnbd_stash_passed_fd(int fd, char *msg, void *data __attribute__((unused)))
+{
+       struct tdnbd_passed_fd *slot = NULL;
+       int idx;
+
+       for (idx = 0; idx < N_PASSED_FDS && slot == NULL; idx++)
+               if (passed_fds[idx].fd == -1)
+                       slot = &passed_fds[idx];
+
+       if (slot == NULL) {
+               ERROR("Error - more than %d fds passed! cannot stash another",
+                               N_PASSED_FDS);
+               close(fd);
+               return;
+       }
+
+       slot->fd = fd;
+       strncpy(slot->id, msg, sizeof(slot->id));
+       gettimeofday(&slot->t, NULL);
+}
+
+/*
+ * Hand out the stashed fd that was passed under name and free its slot.
+ * Returns the fd, or -1 if no live slot carries that name.
+ *
+ * Slots whose fd was already handed out keep their old id, so they are
+ * skipped; otherwise a stale entry could hide a live one of the same name.
+ */
+static int
+tdnbd_retreive_passed_fd(const char *name)
+{
+       int fd, i;
+
+       for (i = 0; i < N_PASSED_FDS; i++) {
+               if (passed_fds[i].fd == -1)
+                       continue;
+
+               if (strncmp(name, passed_fds[i].id,
+                                       sizeof(passed_fds[i].id)) == 0) {
+                       fd = passed_fds[i].fd;
+                       passed_fds[i].fd = -1;
+                       return fd;
+               }
+       }
+
+       ERROR("Couldn't find the fd named: %s", name);
+
+       return -1;
+}
+
+/*
+ * Mark every passed_fds slot as free and start the fd receiver on
+ * "<TAPDISK_NBDCLIENT_LISTEN_SOCK_PATH><pid>", with
+ * tdnbd_stash_passed_fd() as its callback.
+ */
+void
+tdnbd_fdreceiver_start(void)
+{
+       char sock_path[TAPDISK_NBDCLIENT_MAX_PATH_LEN];
+       int slot = 0;
+
+       /* initialise the passed fds list */
+       while (slot < N_PASSED_FDS) {
+               passed_fds[slot].fd = -1;
+               slot++;
+       }
+
+       snprintf(sock_path, sizeof(sock_path), "%s%d",
+                       TAPDISK_NBDCLIENT_LISTEN_SOCK_PATH, getpid());
+
+       fdreceiver = td_fdreceiver_start(sock_path,
+                       tdnbd_stash_passed_fd, NULL);
+}
+
+/*
+ * Stop the fd receiver if one is running. The global pointer is cleared
+ * afterwards so a second call cannot stop the same receiver twice.
+ */
+void
+tdnbd_fdreceiver_stop(void)
+{
+       if (fdreceiver) {
+               td_fdreceiver_stop(fdreceiver);
+               fdreceiver = NULL;
+       }
+}
+
+/*
+ * Cancel one request: log it, drop its timeout event if it has one, and
+ * complete the tapdisk request with error e. i is only used in the log line.
+ */
+static void
+__cancel_req(int i, struct td_nbd_request *pos, int e)
+{
+       char handle[sizeof(pos->nreq.handle) + 1];
+
+       /* the wire handle is a fixed 8-byte field; terminate a copy of it */
+       memcpy(handle, pos->nreq.handle, 8);
+       handle[8] = '\0';
+
+       INFO("Entry %d: handle='%s' type=%d -- reporting errno: %d",
+                       i, handle, ntohl(pos->nreq.type), e);
+
+       if (pos->timeout_event >= 0) {
+               tapdisk_server_unregister_event(pos->timeout_event);
+               pos->timeout_event = -1;
+       }
+
+       td_complete_request(pos->treq, e);
+}
+
+/*
+ * Shut the client down: unregister the writer and reader events, cancel
+ * every sent and pending request with error e, and set closed to 3.
+ */
+static void
+tdnbd_disable(struct tdnbd_data *prv, int e)
+{
+       struct td_nbd_request *req, *next;
+       int entry = 0;
+
+       INFO("NBD client full-disable");
+
+       tapdisk_server_unregister_event(prv->writer_event_id);
+       tapdisk_server_unregister_event(prv->reader_event_id);
+
+       /* entries are numbered across both queues, sent requests first */
+       TAILQ_FOREACH_SAFE(req, &prv->sent_reqs, queue, next) {
+               __cancel_req(entry, req, e);
+               entry++;
+       }
+
+       TAILQ_FOREACH_SAFE(req, &prv->pending_reqs, queue, next) {
+               __cancel_req(entry, req, e);
+               entry++;
+       }
+
+       INFO("Setting closed");
+       prv->closed = 3;
+}
+
+/* NBD writer queue */
+
+/* Return code: how much is left to write, or a negative error code */
+/*
+ * Send as much of data as the socket accepts, advancing data->so_far.
+ *
+ * Returns the number of bytes still unsent (0 when complete, more than 0
+ * if the socket would block), or -1 on a send error or when send() returns 0.
+ */
+static int
+tdnbd_write_some(int fd, struct nbd_queued_io *data)
+{
+       int remaining = data->len - data->so_far;
+
+       while (remaining > 0) {
+               int sent = send(fd, data->buffer + data->so_far, remaining, 0);
+
+               switch (sent) {
+               case -1: {
+                       char *code;
+
+                       if ((errno == EAGAIN) || (errno == EWOULDBLOCK))
+                               return remaining;
+
+                       code = strerror(errno);
+                       ERROR("Bad return code %d from send (%s)", sent,
+                                       (code == 0 ? "unknown" : code));
+                       return sent;
+               }
+               case 0:
+                       ERROR("Server shutdown prematurely in write_some");
+                       return -1;
+               default:
+                       remaining -= sent;
+                       data->so_far += sent;
+                       break;
+               }
+       }
+
+       return remaining;
+}
+
+/*
+ * Receive as much of data as the socket currently holds, advancing
+ * data->so_far.
+ *
+ * Returns the number of bytes still missing (0 when complete, more than 0
+ * if the socket would block), or -1 on a recv error or when recv() returns 0.
+ */
+static int
+tdnbd_read_some(int fd, struct nbd_queued_io *data)
+{
+       int left = data->len - data->so_far;
+       int rc;
+       char *code;
+
+       while (left > 0) {
+               rc = recv(fd, data->buffer + data->so_far, left, 0);
+
+               if (rc == -1) {
+
+                       if ((errno == EAGAIN) || (errno == EWOULDBLOCK))
+                               return left;
+
+                       code = strerror(errno);
+                       /* this is the receive path: name the right call */
+                       ERROR("Bad return code %d from recv (%s)", rc,
+                                       (code == 0 ? "unknown" : code));
+                       return rc;
+               }
+
+               if (rc == 0) {
+                       ERROR("Server shutdown prematurely in read_some");
+                       return -1;
+               }
+
+               data->so_far += rc;
+               left -= rc;
+       }
+
+       return left;
+}
+
+/* Timeout event handler: log the event id and disable the whole client. */
+static void
+tdnbd_timeout_cb(event_id_t eb, char mode __attribute__((unused)), void *data)
+{
+       ERROR("Timeout!: %d", eb);
+
+       tdnbd_disable((struct tdnbd_data *)data, ETIMEDOUT);
+}
+
+/*
+ * Socket-writable handler: send pending requests in order.
+ *
+ * Each request's header is sent first, followed by its body for writes.
+ * Requests sent in full move to sent_reqs, except a disconnect, which gets
+ * no reply and goes back to free_reqs. The handler returns as soon as the
+ * socket would block. Once everything is written the write event is
+ * disabled.
+ *
+ * A send error disables the client; it must not be mistaken for a completed
+ * send, or the request would wait for a reply that never comes.
+ */
+static void
+tdnbd_writer_cb(event_id_t eb __attribute__((unused)),
+        char mode __attribute__((unused)), void *data)
+{
+       struct td_nbd_request *pos, *q;
+       struct tdnbd_data *prv = data;
+       int rc;
+
+       TAILQ_FOREACH_SAFE(pos, &prv->pending_reqs, queue, q) {
+               rc = tdnbd_write_some(prv->socket, &pos->header);
+
+               /* the body is only attempted once the header is fully out */
+               if (rc == 0 && ntohl(pos->nreq.type) == NBD_CMD_WRITE)
+                       rc = tdnbd_write_some(prv->socket, &pos->body);
+
+               if (rc > 0)
+                       return; /* socket full, resume on the next event */
+
+               if (rc < 0) {
+                       ERROR("Error sending request: %d", rc);
+                       tdnbd_disable(prv, EIO);
+                       return;
+               }
+
+               if (ntohl(pos->nreq.type) == NBD_CMD_DISC) {
+                       INFO("sent close request");
+                       /*
+                        * We don't expect a response from a DISC, so move the
+                        * request back onto the free list
+                        */
+                       TAILQ_MOVE_HEAD(pos, &prv->pending_reqs,
+                                       &prv->free_reqs, queue);
+                       prv->nr_free_count++;
+                       prv->closed = 2;
+               } else
+                       TAILQ_MOVE_HEAD(pos, &prv->pending_reqs,
+                                       &prv->sent_reqs, queue);
+       }
+
+       /* If we're here, we've written everything */
+
+       disable_write_queue(prv);
+
+       if (prv->closed == 2)
+               tdnbd_disable(prv, EIO);
+
+       return;
+}
+
+/*
+ * Register the socket-writable event unless it is already registered.
+ * Returns 0 if it was already registered, otherwise the id returned by the
+ * registration.
+ */
+static int
+enable_write_queue(struct tdnbd_data *prv)
+{
+       if (prv->writer_event_id < 0) {
+               prv->writer_event_id = tapdisk_server_register_event(
+                               SCHEDULER_POLL_WRITE_FD, prv->socket, 0,
+                               tdnbd_writer_cb, prv);
+
+               return prv->writer_event_id;
+       }
+
+       return 0;
+}
+
+/*
+ * Unregister the socket-writable event if it is registered, and mark it as
+ * unregistered by setting its id to -1.
+ */
+static void
+disable_write_queue(struct tdnbd_data *prv)
+{
+       if (prv->writer_event_id >= 0) {
+               tapdisk_server_unregister_event(prv->writer_event_id);
+               prv->writer_event_id = -1;
+       }
+}
+
+/*
+ * Build an NBD request and queue it for sending.
+ *
+ * type, offset and length go into the request header; buffer is the data to
+ * send for a write or to fill for a read. treq is the tapdisk request this
+ * NBD request belongs to, and fake is stored in the request unchanged.
+ *
+ * Returns 0 once the request is queued, -EBUSY if no request slot is free
+ * (treq is not completed in that case), or -ETIMEDOUT if the client has been
+ * disabled (treq is completed with -ETIMEDOUT).
+ */
+static int
+tdnbd_queue_request(struct tdnbd_data *prv, int type, uint64_t offset,
+               char *buffer, uint32_t length, td_request_t treq, int fake)
+{
+       struct td_nbd_request *req;
+       int id;
+
+       if (prv->nr_free_count == 0)
+               return -EBUSY;
+
+       if (prv->closed == 3) {
+               td_complete_request(treq, -ETIMEDOUT);
+               return -ETIMEDOUT;
+       }
+
+       req = TAILQ_FIRST(&prv->free_reqs);
+       if (req == NULL)
+               return -EBUSY;
+
+       /*
+        * Unlink the slot from the free list before reusing it. Inserting it
+        * into pending_reqs while it is still the head of free_reqs would
+        * corrupt both lists and hand the same slot out again.
+        */
+       TAILQ_REMOVE(&prv->free_reqs, req, queue);
+       prv->nr_free_count--;
+
+       /* fill in the request */
+
+       req->treq = treq;
+       id = global_id++;
+       snprintf(req->nreq.handle, 8, "td%05x", id % 0xffff);
+
+       /* No response from a disconnect, so no need for a timeout */
+       if (type != NBD_CMD_DISC) {
+               req->timeout_event = tapdisk_server_register_event(
+                               SCHEDULER_POLL_TIMEOUT,
+                               -1, /* dummy */
+                               NBD_TIMEOUT,
+                               tdnbd_timeout_cb,
+                               prv);
+       } else {
+               req->timeout_event = -1;
+       }
+
+       INFO("request: %s timeout %d", req->nreq.handle, req->timeout_event);
+
+       req->nreq.magic = htonl(NBD_REQUEST_MAGIC);
+       req->nreq.type = htonl(type);
+       req->nreq.from = htonll(offset);
+       req->nreq.len = htonl(length);
+       req->header.buffer = (char *)&req->nreq;
+       req->header.len = sizeof(req->nreq);
+       req->header.so_far = 0;
+       req->body.buffer = buffer;
+       req->body.len = length;
+       req->body.so_far = 0;
+       req->fake = fake;
+
+       TAILQ_INSERT_TAIL(&prv->pending_reqs, req, queue);
+
+       if (prv->writer_event_id < 0)
+               enable_write_queue(prv);
+
+       return 0;
+}
+
+/*
+ * NBD reader callback: consume one reply (header, then any READ payload)
+ * from the socket and complete the request whose handle matches it.
+ */
+static void
+tdnbd_reader_cb(event_id_t eb __attribute__((unused)),
+        char mode __attribute__((unused)), void *data)
+{
+       char handle[9];
+       int do_disable = 0;
+
+       /* Check to see if we're in the middle of reading a response already */
+       struct tdnbd_data *prv = data;
+       int rc = tdnbd_read_some(prv->socket, &prv->cur_reply_qio);
+
+       if (rc < 0) {
+               ERROR("Error reading reply header: %d", rc);
+               tdnbd_disable(prv, EIO);
+               return;
+       }
+
+       if (rc > 0)
+               return; /* need more data */
+
+       /* Got a header. */
+       if (prv->current_reply.error != 0) {
+               ERROR("Error in reply: %d", prv->current_reply.error);
+               tdnbd_disable(prv, EIO);
+               return;
+       }
+
+       /* Have we found the request yet? */
+       if (prv->curr_reply_req == NULL) {
+               struct td_nbd_request *pos, *q;
+               TAILQ_FOREACH_SAFE(pos, &prv->sent_reqs, queue, q) {
+                       if (memcmp(pos->nreq.handle, prv->current_reply.handle,
+                                               8) == 0) {
+                               prv->curr_reply_req = pos;
+                               break;
+                       }
+               }
+
+               if (prv->curr_reply_req == NULL) {
+                       memcpy(handle, prv->current_reply.handle, 8);
+                       handle[8] = 0;
+                       ERROR("Couldn't find request corresponding to reply "
+                                       "(reply handle='%s')", handle);
+                       tdnbd_disable(prv, EIO);
+                       return;
+               }
+       }
+
+       switch(ntohl(prv->curr_reply_req->nreq.type)) {
+       case NBD_CMD_READ:
+               rc = tdnbd_read_some(prv->socket, &prv->curr_reply_req->body);
+
+               if (rc < 0) {
+                       ERROR("Error reading body of request: %d", rc);
+                       tdnbd_disable(prv, EIO);
+                       return;
+               }
+
+               if (rc > 0)
+                       return; /* need more data */
+
+               td_complete_request(prv->curr_reply_req->treq, 0);
+               break;
+       case NBD_CMD_WRITE:
+               td_complete_request(prv->curr_reply_req->treq, 0);
+               break;
+       default:
+               ERROR("Unhandled request response: %d",
+                               ntohl(prv->curr_reply_req->nreq.type));
+               /* Break (not return) so the request is recycled below and
+                * do_disable actually takes effect. */
+               do_disable = 1;
+               break;
+       }
+
+       /* remove the state */
+       TAILQ_MOVE_HEAD(prv->curr_reply_req, &prv->sent_reqs, &prv->free_reqs,
+                       queue);
+       prv->nr_free_count++;
+
+       prv->cur_reply_qio.so_far = 0;
+       if (prv->curr_reply_req->timeout_event >= 0) {
+               tapdisk_server_unregister_event(
+                               prv->curr_reply_req->timeout_event);
+       }
+
+       prv->curr_reply_req = NULL;
+
+       /*
+        * NB: do this here otherwise we cancel the request that has just been
+        * moved
+        */
+       if (do_disable)
+               tdnbd_disable(prv, EIO);
+}
+
+/*
+ * Block until fd is readable, for at most 10 seconds.
+ * Returns select()'s result: > 0 if readable, 0 on timeout, < 0 on error.
+ */
+static int
+tdnbd_wait_read(int fd)
+{
+       struct timeval timeout = { .tv_sec = 10, .tv_usec = 0 };
+       fd_set readable;
+
+       FD_ZERO(&readable);
+       FD_SET(fd, &readable);
+       return select(fd + 1, &readable, NULL, NULL, &timeout);
+}
+
+/*
+ * Run the NBD handshake on prv->socket and record the export size in
+ * driver->info. Returns 0 on success; on failure closes the socket and
+ * returns -1.
+ *
+ * NBD negotiation protocol:
+ *
+ * Server sends 'NBDMAGIC'
+ * then it sends 0x00420281861253L
+ * then it sends a 64 bit bigendian size
+ * then it sends a 32 bit bigendian flags
+ * then it sends 124 bytes of nothing
+ */
+static int
+tdnbd_nbd_negotiate(struct tdnbd_data *prv, td_driver_t *driver)
+{
+#define RECV_BUFFER_SIZE 256
+       int rc;
+       char buffer[RECV_BUFFER_SIZE];
+       uint64_t magic;
+       uint64_t size;
+       uint32_t flags;
+       int padbytes = 124;
+       int sock = prv->socket;
+
+       /* Still on blocking IO here: bound each wait with a timeout */
+       if (tdnbd_wait_read(sock) <= 0) {
+               ERROR("Timeout in nbd_negotiate");
+               close(sock);
+               return -1;
+       }
+
+       rc = recv(sock, buffer, 8, 0);
+       if (rc < 8) {
+               ERROR("Short read in negotiation(1) (%d)\n", rc);
+               close(sock);
+               return -1;
+       }
+
+       if (memcmp(buffer, "NBDMAGIC", 8) != 0) {
+               buffer[8] = 0;
+               ERROR("Error in NBD negotiation: got '%s'", buffer);
+               close(sock);
+               return -1;
+       }
+
+       if (tdnbd_wait_read(sock) <= 0) {
+               ERROR("Timeout in nbd_negotiate");
+               close(sock);
+               return -1;
+       }
+
+       rc = recv(sock, &magic, sizeof(magic), 0);
+       if (rc < (int)sizeof(magic)) {
+               ERROR("Short read in negotiation(2) (%d)\n", rc);
+               close(sock);
+               return -1;
+       }
+
+       if (ntohll(magic) != NBD_NEGOTIATION_MAGIC) {
+               ERROR("Not enough magic in negotiation(2) (%"PRIu64")\n",
+                               ntohll(magic));
+               close(sock);
+               return -1;
+       }
+
+       if (tdnbd_wait_read(sock) <= 0) {
+               ERROR("Timeout in nbd_negotiate");
+               close(sock);
+               return -1;
+       }
+
+       rc = recv(sock, &size, sizeof(size), 0);
+       if (rc < (int)sizeof(size)) { /* signed compare: catches rc == -1 */
+               ERROR("Short read in negotiation(3) (%d)\n", rc);
+               close(sock);
+               return -1;
+       }
+
+       INFO("Got size: %"PRIu64"", ntohll(size));
+
+       driver->info.size = ntohll(size) >> SECTOR_SHIFT;
+       driver->info.sector_size = DEFAULT_SECTOR_SIZE;
+       driver->info.info = 0;
+
+       if (tdnbd_wait_read(sock) <= 0) {
+               ERROR("Timeout in nbd_negotiate");
+               close(sock);
+               return -1;
+       }
+
+       rc = recv(sock, &flags, sizeof(flags), 0);
+       if (rc < (int)sizeof(flags)) { /* signed compare: catches rc == -1 */
+               ERROR("Short read in negotiation(4) (%d)\n", rc);
+               close(sock);
+               return -1;
+       }
+
+       INFO("Got flags: %"PRIu32"", ntohl(flags));
+
+       while (padbytes > 0) {
+               if (tdnbd_wait_read(sock) <= 0) {
+                       ERROR("Timeout in nbd_negotiate");
+                       close(sock);
+                       return -1;
+               }
+
+               rc = recv(sock, buffer, padbytes, 0);
+               if (rc <= 0) { /* 0 means the peer closed: don't spin on it */
+                       ERROR("Bad read in negotiation(5) (%d)\n", rc);
+                       close(sock);
+                       return -1;
+               }
+               padbytes -= rc;
+       }
+
+       INFO("Successfully connected to NBD server");
+
+       fcntl(sock, F_SETFL, O_NONBLOCK);
+
+       return 0;
+}
+
+/*
+ * Connect to prv->peer_ip:prv->port over TCP and run the NBD handshake.
+ * Returns 0 on success, -1 on failure.
+ */
+static int
+tdnbd_connect_import_session(struct tdnbd_data *prv, td_driver_t* driver)
+{
+       int sock;
+       int opt = 1;
+       int rc;
+
+       sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+       if (sock < 0) {
+               ERROR("Could not create socket: %s\n", strerror(errno));
+               return -1;
+       }
+
+       rc = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (void *)&opt,
+                       sizeof(opt));
+       if (rc < 0) {
+               ERROR("Could not set TCP_NODELAY: %s\n", strerror(errno));
+               goto fail;
+       }
+
+       /* Size of the struct, not of a pointer to it; zeroed for sin_zero. */
+       prv->remote = calloc(1, sizeof(*prv->remote));
+       if (!prv->remote) {
+               ERROR("struct sockaddr_in malloc failure\n");
+               goto fail;
+       }
+       prv->remote->sin_family = AF_INET;
+       rc = inet_pton(AF_INET, prv->peer_ip, &(prv->remote->sin_addr.s_addr));
+       if (rc < 0) {
+               ERROR("Could not create inaddr: %s\n", strerror(errno));
+               goto fail;
+       } else if (rc == 0) {
+               ERROR("inet_pton parse error\n");
+               goto fail;
+       }
+       prv->remote->sin_port = htons(prv->port);
+
+       if (connect(sock, (struct sockaddr *)prv->remote,
+                               sizeof(*prv->remote)) < 0) {
+               ERROR("Could not connect to peer: %s\n", strerror(errno));
+               goto fail;
+       }
+
+       prv->socket = sock;
+       return tdnbd_nbd_negotiate(prv, driver);
+
+fail:
+       free(prv->remote);
+       prv->remote = NULL;
+       close(sock);
+       return -1;
+}
+
+/* -- interface -- */
+
+static int tdnbd_close(td_driver_t*, struct tqh_td_image_handle *);
+
+/* Open an NBD image; name is "<ip>:<port>" or the name of a passed fd.
+ * Returns 0 on success, -1 on failure. */
+static int
+tdnbd_open(td_driver_t* driver, const char* name, td_flag_t flags)
+{
+       struct tdnbd_data *prv;
+       char peer_ip[256];
+       int port;
+       int rc;
+       int i;
+
+       driver->info.sector_size = 512;
+       driver->info.info = 0;
+       prv = (struct tdnbd_data *)driver->data;
+       memset(prv, 0, sizeof(struct tdnbd_data));
+
+       INFO("Opening nbd export to %s (flags=%x)\n", name, flags);
+
+       prv->writer_event_id = -1;
+       TAILQ_INIT(&prv->sent_reqs);
+       TAILQ_INIT(&prv->pending_reqs);
+       TAILQ_INIT(&prv->free_reqs);
+       for (i = 0; i < MAX_NBD_REQS; i++) {
+               prv->requests[i].timeout_event = -1;
+               TAILQ_INSERT_HEAD(&prv->free_reqs, &prv->requests[i], queue);
+       }
+       prv->nr_free_count = MAX_NBD_REQS;
+       prv->cur_reply_qio.buffer = (char *)&prv->current_reply;
+       prv->cur_reply_qio.len = sizeof(struct nbd_reply);
+       rc = sscanf(name, "%255[^:]:%d", peer_ip, &port);
+       if (rc == 2) {
+               prv->peer_ip = strdup(peer_ip);
+               if (!prv->peer_ip) {
+                       ERROR("Failure to malloc for NBD destination");
+                       return -1;
+               }
+               prv->port = port;
+               prv->name = NULL;
+               INFO("Export peer=%s port=%d\n", prv->peer_ip, prv->port);
+               if (tdnbd_connect_import_session(prv, driver) < 0) {
+                       free(prv->peer_ip); /* don't leak it on failure */
+                       prv->peer_ip = NULL;
+                       return -1;
+               }
+       } else {
+               prv->socket = tdnbd_retreive_passed_fd(name);
+               if (prv->socket < 0) {
+                       ERROR("Couldn't find fd named: %s", name);
+                       return -1;
+               }
+               INFO("Found passed fd. Connecting...");
+               prv->remote = NULL;
+               prv->peer_ip = NULL;
+               prv->name = strdup(name);
+               prv->port = -1;
+               if (tdnbd_nbd_negotiate(prv, driver) < 0) {
+                       ERROR("Failed to negotiate");
+                       free(prv->name); /* don't leak it on failure */
+                       prv->name = NULL;
+                       return -1;
+               }
+       }
+
+       prv->reader_event_id = tapdisk_server_register_event(
+                       SCHEDULER_POLL_READ_FD, prv->socket, 0,
+                       tdnbd_reader_cb, (void *)prv);
+
+       prv->flags = flags;
+       prv->closed = 0;
+
+       if (flags & TD_OPEN_SECONDARY)
+               INFO("Opening in secondary mode: Read requests will be "
+                               "forwarded");
+       return 0;
+}
+
+static int
+tdnbd_close(td_driver_t* driver,
+        struct tqh_td_image_handle *head __attribute__((unused)))
+{
+       struct tdnbd_data *prv = (struct tdnbd_data *)driver->data;
+       td_request_t treq;
+       /* Close the image: send an NBD disconnect, then release the socket. */
+       bzero(&treq, sizeof(treq));
+       /* In state 3 the disconnect is skipped and the fd is simply closed. */
+       if (prv->closed == 3) {
+               INFO("NBD close: already decided that the connection is dead.");
+               if (prv->socket >= 0)
+                       close(prv->socket);
+               prv->socket = -1;
+               return 0;
+       }
+
+       /* Send a close packet */
+       /* treq is a zeroed placeholder; offset, buffer and length are all 0. */
+       INFO("Sending disconnect request");
+       tdnbd_queue_request(prv, NBD_CMD_DISC, 0, 0, 0, treq, 0);
+
+       INFO("Switching socket to blocking IO mode");
+       fcntl(prv->socket, F_SETFL, fcntl(prv->socket, F_GETFL) & ~O_NONBLOCK);
+       /* The writer callback is invoked directly, not via the scheduler. */
+       INFO("Writing disconnection request");
+       tdnbd_writer_cb(0, 0, prv);
+
+       INFO("Written");
+
+       if (prv->peer_ip) {
+               free(prv->peer_ip);
+               prv->peer_ip = NULL;
+       }
+       /* Named fds are handed to tdnbd_stash_passed_fd(), not closed. */
+       if (prv->name) {
+               tdnbd_stash_passed_fd(prv->socket, prv->name, 0);
+               free(prv->name);
+       } else {
+               if (prv->socket >= 0)
+                       close(prv->socket);
+               prv->socket = -1;
+       }
+       /* NOTE(review): prv->remote is not freed here - confirm its owner. */
+       return 0;
+}
+
+/* Queue a read: forwarded in secondary mode, else sent as NBD_CMD_READ. */
+static void
+tdnbd_queue_read(td_driver_t* driver, td_request_t treq)
+{
+       struct tdnbd_data *nbd = (struct tdnbd_data *)driver->data;
+       int      len     = treq.secs * driver->info.sector_size;
+       uint64_t start   = treq.sec * (uint64_t)driver->info.sector_size;
+
+       if (nbd->flags & TD_OPEN_SECONDARY) {
+               td_forward_request(treq);
+               return;
+       }
+       tdnbd_queue_request(nbd, NBD_CMD_READ, start, treq.buf, len, treq, 0);
+}
+
+/* Queue a write: always sent to the server as NBD_CMD_WRITE. */
+static void
+tdnbd_queue_write(td_driver_t* driver, td_request_t treq)
+{
+       struct tdnbd_data *nbd = (struct tdnbd_data *)driver->data;
+       int      len     = treq.secs * driver->info.sector_size;
+       uint64_t start   = treq.sec * (uint64_t)driver->info.sector_size;
+
+       tdnbd_queue_request(nbd, NBD_CMD_WRITE, start, treq.buf, len, treq, 0);
+}
+
+static int
+tdnbd_get_parent_id(td_driver_t* driver __attribute__((unused)),
+        td_disk_id_t* id __attribute__((unused)))
+{
+       return TD_NO_PARENT; /* always: both arguments are ignored */
+}
+
+static int
+tdnbd_validate_parent(td_driver_t *driver __attribute__((unused)),
+               td_driver_t *parent __attribute__((unused)),
+        td_flag_t flags __attribute__((unused)))
+{
+       return -EINVAL; /* every candidate parent is rejected */
+}
+
+struct tap_disk tapdisk_nbd = { /* callback table for the NBD driver */
+       .disk_type          = "tapdisk_nbd",
+       .private_data_size  = sizeof(struct tdnbd_data),
+       .flags              = 0,
+       .td_open            = tdnbd_open,
+       .td_close           = tdnbd_close,
+       .td_queue_read      = tdnbd_queue_read,
+       .td_queue_write     = tdnbd_queue_write,
+       .td_get_parent_id   = tdnbd_get_parent_id,
+       .td_validate_parent = tdnbd_validate_parent,
+};
+
diff --git a/tools/blktap3/drivers/block-ram.c 
b/tools/blktap3/drivers/block-ram.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/block-ram.c
@@ -0,0 +1,267 @@
+/* 
+ * Copyright (c) 2007, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <string.h>
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+
+void *img; /* in-memory copy of the image, used by every open handle */
+long int   disksector_size; /* cached info.sector_size of the loaded image */
+long int   disksize; /* cached info.size (in sectors) */
+long int   diskinfo; /* cached info.info */
+static int connections = 0; /* ++ in tdram_open, -- in tdram_close */
+
+struct tdram_state { /* per-handle private data */
+        int fd; /* image fd; -1 when an already loaded image is reused */
+};
+
+/*
+ * Fill *info with the size (in sectors) and sector size of the image behind
+ * fd and cache them in the disk* globals. Returns 0, or -EINVAL on error.
+ */
+static int get_image_info(int fd, td_disk_info_t *info)
+{
+       struct stat stat;
+
+       if (fstat(fd, &stat) != 0) {
+               DPRINTF("ERROR: fstat failed, Couldn't stat image");
+               return -EINVAL;
+       }
+
+       if (S_ISBLK(stat.st_mode)) {
+               /* BLKGETSIZE stores an unsigned long: use a matching temp */
+               unsigned long nsect = 0;
+               if (ioctl(fd, BLKGETSIZE, &nsect) != 0) {
+                       DPRINTF("ERR: BLKGETSIZE failed, couldn't stat image");
+                       return -EINVAL;
+               }
+               info->size = nsect;
+
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+
+               /*Get the sector size*/
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+#if defined(BLKSSZGET)
+               {
+                       /* BLKSSZGET stores an int: read it into a matching temp */
+                       int ssz = 0;
+                       if (ioctl(fd, BLKSSZGET, &ssz) == 0 && ssz > 0)
+                               info->sector_size = ssz;
+                       if (info->sector_size != DEFAULT_SECTOR_SIZE)
+                               DPRINTF("Note: sector size is %ld (not %d)\n",
+                                       info->sector_size, DEFAULT_SECTOR_SIZE);
+               }
+#endif
+       } else {
+               /*Local file? try fstat instead*/
+               info->size = (stat.st_size >> SECTOR_SHIFT);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(info->size << SECTOR_SHIFT),
+                       (long long unsigned)info->size);
+       }
+
+       if (info->size == 0) {
+               info->size =((uint64_t) MAX_RAMDISK_SIZE);
+               info->sector_size = DEFAULT_SECTOR_SIZE;
+       }
+       info->info = 0;
+
+       /*Store variables locally*/
+       disksector_size = info->sector_size;
+       disksize        = info->size;
+       diskinfo        = info->info;
+       DPRINTF("Image sector_size: \n\t[%lu]\n",
+               info->sector_size);
+       return 0;
+}
+
+/* Open the disk file and initialize ram state: the first open loads the
+ * image into img, later opens reuse it. Returns 0 or a negative error. */
+int tdram_open (td_driver_t *driver, const char *name, td_flag_t flags)
+{
+       char *p;
+       uint64_t i, count = 0;
+       int fd, err, ret, o_flags;
+       struct tdram_state *prv = (struct tdram_state *)driver->data;
+
+       connections++;
+       prv->fd = -1;
+       if (connections > 1) {
+               driver->info.sector_size = disksector_size;
+               driver->info.size        = disksize;
+               driver->info.info        = diskinfo;
+               DPRINTF("Image already open, returning parameters:\n");
+               DPRINTF("Image size: \n\tpre sector_shift  [%llu]\n\tpost "
+                       "sector_shift [%llu]\n",
+                       (long long unsigned)(driver->info.size << SECTOR_SHIFT),
+                       (long long unsigned)driver->info.size);
+               DPRINTF("Image sector_size: \n\t[%lu]\n",
+                       driver->info.sector_size);
+               return 0;
+       }
+
+       /* Open the file */
+       o_flags = O_DIRECT | O_LARGEFILE |
+               ((flags & TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
+       fd = open(name, o_flags);
+       if ((fd == -1) && (errno == EINVAL)) {
+               /* Maybe O_DIRECT isn't supported. */
+               o_flags &= ~O_DIRECT;
+               fd = open(name, o_flags);
+               if (fd != -1)
+                       DPRINTF("WARNING: Accessing image without "
+                               "O_DIRECT! (%s)\n", name);
+       } else if (fd != -1)
+               DPRINTF("open(%s) with O_DIRECT\n", name);
+       if (fd == -1) {
+               ret = -errno; /* read errno before logging can change it */
+               DPRINTF("Unable to open [%s]!\n",name);
+               goto fail;
+       }
+       ret = get_image_info(fd, &driver->info);
+       if (ret)
+               goto fail;
+       if (driver->info.size > (uint64_t)MAX_RAMDISK_SIZE) {
+               DPRINTF("Disk exceeds limit, must be less than [%d]MB",
+                       (MAX_RAMDISK_SIZE<<SECTOR_SHIFT)>>20);
+               ret = -ENOMEM;
+               goto fail;
+       }
+
+       /*Read the image into memory*/
+       err = posix_memalign(&img, DEFAULT_SECTOR_SIZE,
+                            driver->info.size << SECTOR_SHIFT);
+       if (err) {
+               /* posix_memalign returns the error; errno is not set */
+               DPRINTF("Mem malloc failed\n");
+               ret = -err;
+               goto fail;
+       }
+       p = img;
+       DPRINTF("Reading %llu bytes.......",
+               (long long unsigned)driver->info.size << SECTOR_SHIFT);
+       for (i = 0; i < driver->info.size; i++) {
+               ssize_t n = read(fd, p, driver->info.sector_size);
+               if (n != (ssize_t)driver->info.sector_size) {
+                       DPRINTF("ret = %d, errno = %d\n", (int)n, errno);
+                       break;
+               }
+               count += n;
+               p += n;
+       }
+       DPRINTF("[%llu]\n", (long long unsigned)count);
+       if (count == driver->info.size << SECTOR_SHIFT) {
+               prv->fd = fd;
+               return 0;
+       }
+       ret = -1;
+fail:
+       /* Undo the open so that a later attempt starts from scratch */
+       if (fd != -1)
+               close(fd);
+       free(img);
+       img = NULL;
+       connections--;
+       return ret;
+}
+
+/* Serve a read straight from the in-memory image and complete it. */
+void tdram_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       int      nbytes  = treq.secs * driver->info.sector_size;
+       uint64_t start   = treq.sec * (uint64_t)driver->info.sector_size;
+
+       memcpy(treq.buf, (char *)img + start, nbytes);
+       td_complete_request(treq, 0);
+}
+
+/* Copy a write into the in-memory image and complete it. We assume that
+ * write access is controlled at a higher level for multiple disks. */
+void tdram_queue_write(td_driver_t *driver, td_request_t treq)
+{
+       int      nbytes  = treq.secs * driver->info.sector_size;
+       uint64_t start   = treq.sec * (uint64_t)driver->info.sector_size;
+
+       memcpy((char *)img + start, treq.buf, nbytes);
+
+       td_complete_request(treq, 0);
+}
+
+int tdram_close(td_driver_t *driver __attribute__((unused)),
+        struct tqh_td_image_handle *head __attribute__((unused)))
+{
+       connections--;
+       /* NOTE(review): img and the image fd are not released here. */
+       return 0;
+}
+
+int tdram_get_parent_id(td_driver_t *driver __attribute__((unused)),
+        td_disk_id_t *id __attribute__((unused)))
+{
+       return TD_NO_PARENT; /* always: both arguments are ignored */
+}
+
+int tdram_validate_parent(td_driver_t *driver __attribute__((unused)),
+                         td_driver_t *pdriver __attribute__((unused)),
+              td_flag_t flags __attribute__((unused)))
+{
+       return -EINVAL; /* every candidate parent is rejected */
+}
+
+struct tap_disk tapdisk_ram = { /* callback table for the RAM driver */
+       .disk_type          = "tapdisk_ram",
+       .flags              = 0,
+       .private_data_size  = sizeof(struct tdram_state),
+       .td_open            = tdram_open,
+       .td_close           = tdram_close,
+       .td_queue_read      = tdram_queue_read,
+       .td_queue_write     = tdram_queue_write,
+       .td_get_parent_id   = tdram_get_parent_id,
+       .td_validate_parent = tdram_validate_parent,
+       .td_debug           = NULL,
+};
diff --git a/tools/blktap3/drivers/block-valve.c 
b/tools/blktap3/drivers/block-valve.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/block-valve.c
@@ -0,0 +1,703 @@
+/*
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "tapdisk.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+#include "block-valve.h"
+
+typedef struct td_valve td_valve_t;
+typedef struct td_valve_request td_valve_request_t;
+
+TAILQ_HEAD(tqh_td_valve_request, td_valve_request);
+
+struct td_valve_request {
+       td_request_t            treq;
+       int                     secs;
+       /* Linkage used by the valve's stor and forw queues. */
+    TAILQ_ENTRY(td_valve_request) entry;
+       td_valve_t             *valve;
+};
+
+struct td_valve_stats { /* counters embedded in struct td_valve */
+       unsigned long long      stor; /* presumably stored - TODO confirm */
+       unsigned long long      forw; /* presumably forwarded - TODO confirm */
+};
+
+struct td_valve {
+       char                   *brname; /* path, or name in TD_VALVE_SOCKDIR */
+       unsigned long           flags; /* TD_VALVE_* bits */
+       /* Connection socket and the read event registered for it. */
+       int                     sock;
+       event_id_t              sock_id;
+       /* Timeout events: __valve_sched_event and the reconnect retry. */
+       event_id_t              sched_id;
+       event_id_t              retry_id;
+       /* Reset to 0 by valve_sock_open (exact semantics: TODO confirm). */
+       unsigned int            cred;
+       unsigned int            need;
+       unsigned int            done;
+       /* Request queues walked by the td_valve_for_each_* macros. */
+       struct tqh_td_valve_request        stor;
+       struct tqh_td_valve_request        forw;
+       /* free[0..n_free) is a stack of currently unused requests. */
+       td_valve_request_t      reqv[MAX_REQUESTS];
+       td_valve_request_t     *free[MAX_REQUESTS];
+       int                     n_free;
+
+       struct td_valve_stats   stats;
+};
+
+/* Walk valve->stor; _next lets the body unlink _req while iterating. */
+#define td_valve_for_each_stored_request(_req, _next, _valve)          \
+    TAILQ_FOREACH_SAFE(_req, &(_valve)->stor, entry, _next)
+
+/* Walk valve->forw; _next lets the body unlink _req while iterating. */
+#define td_valve_for_each_forwarded_request(_req, _next, _valve)       \
+    TAILQ_FOREACH_SAFE(_req, &(_valve)->forw, entry, _next)
+
+#define TD_VALVE_CONNECT_INTERVAL 2 /* s */
+
+#define TD_VALVE_RDLIMIT  (1<<0)
+#define TD_VALVE_WRLIMIT  (1<<1)
+#define TD_VALVE_KILLED   (1UL<<31) /* 1<<31 would overflow a signed int */
+
+static void valve_schedule_retry(td_valve_t *);
+static void valve_conn_receive(td_valve_t *);
+static void valve_conn_request(td_valve_t *, unsigned long);
+static void valve_forward_stored_requests(td_valve_t *);
+static void valve_kill(td_valve_t *);
+
+#define DBG(_f, _a...)    if (1) { tlog_syslog(TLOG_DBG, _f, ##_a); }
+#define INFO(_f, _a...)   tlog_syslog(TLOG_INFO, "valve: " _f, ##_a)
+#define WARN(_f, _a...)   tlog_syslog(TLOG_WARN, "WARNING: "_f " in %s:%d", \
+                                     ##_a, __func__, __LINE__)
+#define ERR(_f, _a...)    tlog_syslog(TLOG_WARN, "ERROR: " _f " in %s:%d", \
+                                     ##_a, __func__, __LINE__)
+#define VERR(_err, _f, _a...) tlog_syslog(TLOG_WARN,                    \
+                                         "ERROR: err=%d (%s), " _f ".", \
+                                         _err, strerror(-(_err)), ##_a)
+#undef  PERROR
+#define PERROR(_f, _a...) VERR(-errno, _f, ##_a)
+/* BUG(): log "Aborting" through ERR() and call td_panic(). */
+#define BUG() do {                                             \
+               ERR("Aborting");                                \
+               td_panic();                                     \
+       } while (0)
+
+#define BUG_ON(_cond) do {                                     \
+       if (unlikely(_cond)) {                                  \
+               ERR("(%s) = %ld", #_cond, (long)(_cond));       \
+               BUG();                                          \
+       } } while (0) /* do/while: safe inside an unbraced if/else */
+
+#define WARN_ON(_cond) ({                                      \
+       int __cond = (_cond);   /* evaluate _cond only once */  \
+       if (unlikely(__cond))                                   \
+               WARN("(%s) = %ld", #_cond, (long)__cond);       \
+       __cond;                                                 \
+})
+
+#define TREQ_SIZE(_treq) ((unsigned int)((_treq).secs) << 9)
+
+/* Pop a request off the valve's free stack; NULL when none are left. */
+static td_valve_request_t *
+valve_alloc_request(td_valve_t *valve)
+{
+       if (!valve->n_free)
+               return NULL;
+
+       valve->n_free--;
+       return valve->free[valve->n_free];
+}
+
+static void
+valve_free_request(td_valve_t *valve, td_valve_request_t *req,
+        struct tqh_td_valve_request *head)
+{
+       BUG_ON(valve->n_free >= ARRAY_SIZE(valve->free)); /* stack is full */
+    if (head) /* unlink req from the queue it is on, if one was given */
+        TAILQ_REMOVE(head, req, entry);
+       valve->free[valve->n_free++] = req; /* push onto the free stack */
+}
+
+static void
+__valve_sock_event(event_id_t id __attribute__((unused)),
+        char mode __attribute__((unused)), void *private)
+{
+       td_valve_t *valve = private;
+       /* Read-event handler for valve->sock (see valve_sock_open). */
+       valve_conn_receive(valve);
+       /* Then give stored requests a chance to be forwarded. */
+       valve_forward_stored_requests(valve);
+}
+
+static void
+valve_set_done_pending(td_valve_t *valve)
+{
+       WARN_ON(valve->done == 0); /* expected to be called with done != 0 */
+       tapdisk_server_mask_event(valve->sched_id, 0); /* 0: presumably unmask */
+}
+
+static void
+valve_clear_done_pending(td_valve_t *valve)
+{
+       WARN_ON(valve->done != 0); /* expected to be called with done == 0 */
+       tapdisk_server_mask_event(valve->sched_id, 1); /* 1: presumably mask */
+}
+
+static void
+__valve_sched_event(event_id_t id __attribute__((unused)),
+        char mode __attribute__((unused)), void *private)
+{
+       td_valve_t *valve = private;
+       /* Handler of the sched_id timeout event (see valve_sock_open). */
+       if (likely(valve->done > 0))
+               /* flush valve->done */
+               valve_conn_request(valve, 0);
+}
+
+static void
+valve_sock_close(td_valve_t *valve)
+{
+       if (valve->sock >= 0) { /* each step is skipped if already undone */
+               close(valve->sock);
+               valve->sock = -1;
+       }
+       /* Drop the read event registered for the socket. */
+       if (valve->sock_id >= 0) {
+               tapdisk_server_unregister_event(valve->sock_id);
+               valve->sock_id = -1;
+       }
+       /* Drop the timeout event that runs __valve_sched_event. */
+       if (valve->sched_id >= 0) {
+               tapdisk_server_unregister_event(valve->sched_id);
+               valve->sched_id = -1;
+       }
+}
+
+/*
+ * Connect to the socket named by valve->brname and register its read and
+ * timeout events. Returns 0, or a negative error after undoing the open.
+ */
+static int
+valve_sock_open(td_valve_t *valve)
+{
+       struct sockaddr_un addr = { .sun_family = AF_UNIX };
+       int s, id, err, len;
+
+       s = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (s < 0) {
+               err = -errno; /* PERROR may overwrite errno */
+               PERROR("socket");
+               goto fail;
+       }
+       valve->sock = s;
+
+       if (valve->brname[0] == '/')
+               len = snprintf(addr.sun_path, sizeof(addr.sun_path),
+                              "%s", valve->brname);
+       else
+               len = snprintf(addr.sun_path, sizeof(addr.sun_path),
+                              "%s/%s", TD_VALVE_SOCKDIR, valve->brname);
+       if (len < 0 || (size_t)len >= sizeof(addr.sun_path)) {
+               /* a truncated path would name a different socket */
+               err = -ENAMETOOLONG;
+               goto fail;
+       }
+
+       err = connect(valve->sock, (struct sockaddr *)&addr, sizeof(addr));
+       if (err) {
+               err = -errno;
+               goto fail;
+       }
+
+       id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                          valve->sock, 0,
+                                          __valve_sock_event, valve);
+       if (id < 0) {
+               err = id;
+               goto fail;
+       }
+       valve->sock_id = id;
+
+       id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, -1, 0,
+                                          __valve_sched_event, valve);
+       if (id < 0) {
+               err = id;
+               goto fail;
+       }
+       valve->sched_id = id;
+
+       INFO("Connected to %s", addr.sun_path);
+       valve->cred = 0;
+       valve->need = 0;
+       valve->done = 0;
+       valve_clear_done_pending(valve);
+
+       return 0;
+fail:
+       valve_sock_close(valve);
+       return err;
+}
+
+/*
+ * Non-blocking send of one complete message.
+ *
+ * Returns 0 if all @size bytes were accepted, -errno if send() failed,
+ * or -EPROTO if only part of the message went out.
+ */
+static int
+valve_sock_send(td_valve_t *valve, const void *msg, size_t size)
+{
+       ssize_t sent = send(valve->sock, msg, size, MSG_DONTWAIT);
+
+       if (sent < 0)
+               return -errno;
+
+       return ((size_t)sent == size) ? 0 : -EPROTO;
+}
+
+/*
+ * Non-blocking receive of up to @size bytes into @msg.
+ *
+ * Returns the byte count reported by recv() (0 included), or -errno if
+ * recv() failed.
+ */
+static int
+valve_sock_recv(td_valve_t *valve, void *msg, size_t size)
+{
+       ssize_t got = recv(valve->sock, msg, size, MSG_DONTWAIT);
+
+       if (got >= 0)
+               return got;
+
+       return -errno;
+}
+
+/*
+ * Timeout callback armed by valve_schedule_retry(): try to connect to the
+ * bridge again.
+ *
+ * If the attempt fails the retry event is left registered. Once the
+ * connection is up the event is unregistered and valve->retry_id is reset
+ * to -1, matching how the other event ids are handled, so that no stale
+ * id is left behind in the valve.
+ */
+static void
+__valve_retry_timeout(event_id_t id __attribute__((unused)),
+        char mode __attribute__((unused)), void *private)
+{
+       td_valve_t *valve = private;
+       int err;
+
+       err = valve_sock_open(valve);
+       if (err)
+               return;
+
+       tapdisk_server_unregister_event(valve->retry_id);
+       valve->retry_id = -1;
+}
+
+/*
+ * Register a timeout event, with TD_VALVE_CONNECT_INTERVAL as its timeout
+ * argument, that calls __valve_retry_timeout(), and store the event id in
+ * valve->retry_id.
+ *
+ * Must not be called while a socket event is registered. A failure to
+ * register the timeout is treated as a bug.
+ */
+static void
+valve_schedule_retry(td_valve_t *valve)
+{
+       int id;
+
+       BUG_ON(valve->sock_id >= 0);
+
+       id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
+                                          -1, TD_VALVE_CONNECT_INTERVAL,
+                                          __valve_retry_timeout,
+                                          valve);
+       BUG_ON(id < 0);
+
+       valve->retry_id = id;
+}
+
+/*
+ * Bring the bridge connection up. A failed attempt is logged and a retry
+ * is scheduled; nothing is reported to the caller. Must not be used on a
+ * killed valve.
+ */
+static void
+valve_conn_open(td_valve_t *valve)
+{
+       int rc;
+
+       BUG_ON(valve->flags & TD_VALVE_KILLED);
+
+       rc = valve_sock_open(valve);
+       if (!rc)
+               return;
+
+       WARN("%s: %s", valve->brname, strerror(-rc));
+       valve_schedule_retry(valve);
+}
+
+/*
+ * Close the bridge connection.
+ *
+ * With @reset set, every request still waiting in the store is forwarded
+ * and its slot released. In either case the store is expected to be empty
+ * on return; a warning is raised if it is not.
+ */
+static void
+valve_conn_close(td_valve_t *valve, int reset)
+{
+       td_valve_request_t *cur, *tmp;
+
+       valve_sock_close(valve);
+
+       if (reset) {
+               td_valve_for_each_stored_request(cur, tmp, valve) {
+                       td_forward_request(cur->treq);
+                       valve->stats.forw++;
+                       valve_free_request(valve, cur, &valve->stor);
+               }
+       }
+
+       WARN_ON(!TAILQ_EMPTY(&valve->stor));
+}
+
+/*
+ * Drop the current connection, forwarding all stored requests, and try to
+ * connect again right away.
+ */
+static void
+valve_conn_reset(td_valve_t *valve)
+{
+       valve_conn_close(valve, 1);
+       valve_conn_open(valve);
+}
+
+/*
+ * Read credit grants from the bridge and add them to valve->cred.
+ *
+ * The socket carries a stream of unsigned long values, each one a grant.
+ *  - End of stream, or a receive error other than EAGAIN, resets the
+ *    connection.
+ *  - EAGAIN means there is nothing to read; the function returns without
+ *    touching any state.
+ *  - A single grant of TD_RLB_REQUEST_MAX or more kills the valve.
+ *  - A total larger than what was asked for (valve->need) resets the
+ *    connection.
+ */
+void
+valve_conn_receive(td_valve_t *valve)
+{
+       unsigned long buf[32], cred = 0;
+       ssize_t n;
+       size_t i, cnt;
+       int err;
+
+       n = valve_sock_recv(valve, buf, sizeof(buf));
+       if (!n) {
+               err = -ECONNRESET;
+               goto reset;
+       }
+
+       if (n < 0) {
+               err = n;
+               if (err != -EAGAIN)
+                       goto reset;
+
+               /*
+                * Nothing available. Must not fall through: a negative n
+                * turns into a huge unsigned loop bound below.
+                */
+               return;
+       }
+
+       cnt = (size_t)n / sizeof(buf[0]);
+
+       for (i = 0; i < cnt; i++) {
+               err = WARN_ON(buf[i] >= TD_RLB_REQUEST_MAX);
+               if (err)
+                       goto kill;
+
+               cred += buf[i];
+       }
+
+       if (cred > valve->need) {
+               err = -EINVAL;
+               goto reset;
+       }
+
+       valve->cred += cred;
+       valve->need -= cred;
+
+       return;
+
+reset:
+       VERR(err, "resetting connection");
+       valve_conn_reset(valve);
+       return;
+
+kill:
+       ERR("Killing valve.");
+       valve_kill(valve);
+}
+
+/*
+ * Send one td_valve_req message to the bridge: ask for @size more credit
+ * and report the completions accumulated in valve->done.
+ *
+ * valve->need is increased by @size and valve->done is cleared before the
+ * send is attempted. If the send fails the connection is reset.
+ */
+static void
+valve_conn_request(td_valve_t *valve, unsigned long size)
+{
+       struct td_valve_req _req;
+       int err;
+
+       _req.need    = size;
+       _req.done    = valve->done;
+
+       valve->need += size;
+       valve->done  = 0;
+
+       valve_clear_done_pending(valve);
+
+       err = valve_sock_send(valve, &_req, sizeof(_req));
+       if (!err)
+               return;
+
+       VERR(err, "resetting connection");
+       valve_conn_reset(valve);
+}
+
+/*
+ * Try to pay for @treq out of the available credit.
+ *
+ * Returns 0 if the request may go ahead: the valve is killed, there is no
+ * connection (no credit is charged in either case), or enough credit was
+ * available and has been deducted. Returns -EAGAIN if credit is short.
+ */
+static int
+valve_expend_request(td_valve_t *valve, const td_request_t treq)
+{
+       if ((valve->flags & TD_VALVE_KILLED) || valve->sock < 0)
+               return 0;
+
+       if (!(valve->cred < TREQ_SIZE(treq))) {
+               valve->cred -= TREQ_SIZE(treq);
+               return 0;
+       }
+
+       return -EAGAIN;
+}
+
+/*
+ * Completion callback installed on requests forwarded out of the store
+ * (see valve_forward_stored_requests()). @treq.cb_data is the valve
+ * request that tracks the original request.
+ *
+ * A completion may cover only part of the original request: its sectors
+ * are subtracted from req->secs and its size is added to valve->done. The
+ * original request is completed, with the @error of this last call, once
+ * no sectors remain.
+ */
+static void
+__valve_complete_treq(td_request_t treq, int error)
+{
+       td_valve_request_t *req = treq.cb_data;
+       td_valve_t *valve = req->valve;
+
+       BUG_ON(req->secs < treq.secs);
+       req->secs -= treq.secs;
+
+       valve->done += TREQ_SIZE(treq);
+       valve_set_done_pending(valve);
+
+       if (!req->secs) {
+               td_complete_request(req->treq, error);
+               valve_free_request(valve, req, &valve->forw);
+       }
+}
+
+/*
+ * Walk the store in order and forward every request that can be paid for
+ * with the current credit, stopping at the first one that cannot.
+ *
+ * Each request is forwarded as a clone whose completion is routed through
+ * __valve_complete_treq(), and its tracking entry is moved from the store
+ * to the forwarded list.
+ */
+static void
+valve_forward_stored_requests(td_valve_t *valve)
+{
+       td_valve_request_t *req, *next;
+       td_request_t clone;
+       int err;
+
+       td_valve_for_each_stored_request(req, next, valve) {
+
+               err = valve_expend_request(valve, req->treq);
+               if (err)
+                       break;
+
+               clone         = req->treq;
+               clone.cb      = __valve_complete_treq;
+               clone.cb_data = req;
+
+               td_forward_request(clone);
+               valve->stats.forw++;
+
+               /* replaces list_move(&req->entry, &valve->forw) from the list-based code */
+        TAILQ_MOVE_HEAD(req, &valve->stor, &valve->forw, entry);
+       }
+}
+
+/*
+ * Park @treq in the store and ask the bridge for the credit it needs.
+ *
+ * Returns 0 on success, or -EBUSY if no tracking slot is free.
+ *
+ * The request is queued before the credit request is sent. A failed send
+ * resets the connection, which forwards everything found in the store; a
+ * request queued only after that point would be left waiting with no
+ * credit request outstanding for it.
+ */
+static int
+valve_store_request(td_valve_t *valve, td_request_t treq)
+{
+       td_valve_request_t *req;
+
+       req = valve_alloc_request(valve);
+       if (!req)
+               return -EBUSY;
+
+       req->treq = treq;
+       req->secs = treq.secs;
+
+       TAILQ_INSERT_TAIL(&valve->stor, req, entry);
+       valve->stats.stor++;
+
+       /*
+        * May reset the connection, in which case req has already been
+        * forwarded and released: do not touch it after this call.
+        */
+       valve_conn_request(valve, TREQ_SIZE(treq));
+
+       return 0;
+}
+
+/*
+ * Mark the valve as killed and close the connection, forwarding every
+ * stored request. valve_expend_request() lets all requests pass on a
+ * killed valve.
+ */
+static void
+valve_kill(td_valve_t *valve)
+{
+       valve->flags |= TD_VALVE_KILLED;
+       valve_conn_close(valve, 1);
+}
+
+/*
+ * Put @valve into its initial, disconnected state: zero the structure,
+ * set up the stored and forwarded queues, mark the socket and all event
+ * ids as unused (-1) and record @flags.
+ *
+ * Every slot of valve->reqv is tied back to the valve and handed to
+ * valve_free_request() (presumably placing it on a free list -- that
+ * helper is not visible here).
+ */
+static void
+valve_init(td_valve_t *valve, unsigned long flags)
+{
+       int i;
+
+       memset(valve, 0, sizeof(*valve));
+
+       TAILQ_INIT(&valve->stor);
+       TAILQ_INIT(&valve->forw);
+
+       valve->sock     = -1;
+       valve->sock_id  = -1;
+
+       valve->retry_id = -1;
+       valve->sched_id = -1;
+
+       valve->flags    = flags;
+
+       /* slots are released from the last one down to the first */
+       for (i = ARRAY_SIZE(valve->reqv) - 1; i >= 0; i--) {
+               td_valve_request_t *req = &valve->reqv[i];
+
+               req->valve = valve;
+
+               valve_free_request(valve, req, NULL);
+       }
+}
+
+/*
+ * Driver close hook: warn if requests are still stored or in flight,
+ * close the bridge connection without forwarding anything, and release
+ * the bridge name. Always returns 0.
+ */
+static int
+td_valve_close(td_driver_t *driver,
+        struct tqh_td_image_handle *head __attribute__((unused)))
+{
+       td_valve_t *v = driver->data;
+
+       WARN_ON(!TAILQ_EMPTY(&v->stor));
+       WARN_ON(!TAILQ_EMPTY(&v->forw));
+
+       valve_conn_close(v, 0);
+
+       /* free(NULL) is a no-op, so no guard is needed */
+       free(v->brname);
+       v->brname = NULL;
+
+       return 0;
+}
+
+/*
+ * Driver open hook. @name is the bridge name (see valve_sock_open() for
+ * how it is turned into a socket path). The valve is created with
+ * TD_VALVE_WRLIMIT set.
+ *
+ * Returns 0 once the name has been copied, even if the bridge cannot be
+ * reached yet: valve_conn_open() only logs the failure and schedules a
+ * retry. Returns -errno if the name cannot be duplicated.
+ */
+static int
+td_valve_open(td_driver_t *driver,
+             const char *name, td_flag_t flags __attribute__((unused)))
+{
+       td_valve_t *valve = driver->data;
+       int err;
+
+       valve_init(valve, TD_VALVE_WRLIMIT);
+
+       valve->brname = strdup(name);
+       if (!valve->brname) {
+               err = -errno;
+               goto fail;
+       }
+
+       valve_conn_open(valve);
+
+       return 0;
+
+fail:
+       td_valve_close(driver, NULL);
+       return err;
+}
+
+/*
+ * Queue hook shared by reads and writes.
+ *
+ * A request whose direction is not limited (TD_VALVE_RDLIMIT for reads,
+ * TD_VALVE_WRLIMIT for writes) is forwarded straight away. A limited
+ * request is forwarded if valve_expend_request() lets it through;
+ * otherwise it is parked in the store, or completed with -EBUSY when no
+ * tracking slot is available.
+ */
+static void
+td_valve_queue_request(td_driver_t *driver, td_request_t treq)
+{
+       td_valve_t *valve = driver->data;
+       int err;
+
+       switch (treq.op) {
+
+       case TD_OP_READ:
+               /* leaving the switch means "apply the limit" */
+               if (valve->flags & TD_VALVE_RDLIMIT)
+                       break;
+
+               goto forward;
+
+       case TD_OP_WRITE:
+               if (valve->flags & TD_VALVE_WRLIMIT)
+                       break;
+
+               goto forward;
+
+       default:
+               BUG();
+       }
+
+       err = valve_expend_request(valve, treq);
+       if (!err)
+               goto forward;
+
+       err = valve_store_request(valve, treq);
+       if (err)
+               td_complete_request(treq, -EBUSY);
+
+       return;
+
+forward:
+       td_forward_request(treq);
+       valve->stats.forw++;
+}
+
+/* Parent lookup hook: unconditionally fails with -EINVAL. */
+static int
+td_valve_get_parent_id(td_driver_t *driver __attribute__((unused)),
+        td_disk_id_t *id __attribute__((unused)))
+{
+       return -EINVAL;
+}
+
+/* Parent validation hook: unconditionally fails with -EINVAL. */
+static int
+td_valve_validate_parent(td_driver_t *driver __attribute__((unused)),
+                        td_driver_t *parent_driver __attribute__((unused)),
+             td_flag_t flags __attribute__((unused)))
+{
+       return -EINVAL;
+}
+
+/*
+ * Stats hook: emit the bridge name, the flags, the credit counters and
+ * the stored / forwarded request figures.
+ *
+ * The bridge name is a string and is emitted with the "s" conversion.
+ *
+ * NOTE(review): cred/need/done are emitted with "d"; td_valve_t is not
+ * visible here -- if those fields are unsigned long, as their use in
+ * struct td_valve_req suggests, "lu" would be the matching conversion.
+ */
+static void
+td_valve_stats(td_driver_t *driver, td_stats_t *st)
+{
+       td_valve_t *valve = driver->data;
+       td_valve_request_t *req, *next;
+       int n_reqs;
+
+       tapdisk_stats_field(st, "bridge", "s", valve->brname);
+       tapdisk_stats_field(st, "flags", "#x", valve->flags);
+
+       tapdisk_stats_field(st, "cred", "d", valve->cred);
+       tapdisk_stats_field(st, "need", "d", valve->need);
+       tapdisk_stats_field(st, "done", "d", valve->done);
+
+       /*
+        * stored is [ waiting, total-waits ]
+        */
+
+       n_reqs = 0;
+       td_valve_for_each_stored_request(req, next, valve)
+               n_reqs++;
+
+       tapdisk_stats_field(st, "stor", "[");
+       tapdisk_stats_val(st, "d", n_reqs);
+       tapdisk_stats_val(st, "llu", valve->stats.stor);
+       tapdisk_stats_leave(st, ']');
+
+       /*
+        * forwarded is [ in-flight, total-requests ]
+        */
+
+       n_reqs = 0;
+       td_valve_for_each_forwarded_request(req, next, valve)
+               n_reqs++;
+
+       tapdisk_stats_field(st, "forw", "[");
+       tapdisk_stats_val(st, "d", n_reqs);
+       tapdisk_stats_val(st, "llu", valve->stats.forw);
+       tapdisk_stats_leave(st, ']');
+}
+
+/*
+ * Driver descriptor for the valve. Reads and writes share a single queue
+ * hook, which tells them apart by treq.op.
+ */
+struct tap_disk tapdisk_valve = {
+       .disk_type                  = "tapdisk_valve",
+       .flags                      = 0,
+       .private_data_size          = sizeof(td_valve_t),
+       .td_open                    = td_valve_open,
+       .td_close                   = td_valve_close,
+       .td_queue_read              = td_valve_queue_request,
+       .td_queue_write             = td_valve_queue_request,
+       .td_get_parent_id           = td_valve_get_parent_id,
+       .td_validate_parent         = td_valve_validate_parent,
+       .td_stats                   = td_valve_stats,
+};
diff --git a/tools/blktap3/drivers/block-valve.h b/tools/blktap3/drivers/block-valve.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/block-valve.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2011, Citrix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAPDISK_VALVE_H_
+#define _TAPDISK_VALVE_H_
+
+#define TD_VALVE_SOCKDIR          "/var/run/blktap/ratelimit"
+#define TD_RLB_CONN_MAX           1024
+#define TD_RLB_REQUEST_MAX        (8 << 20)
+
+/*
+ * Message sent from the valve to the bridge over the socket:
+ *   need - amount of additional credit being asked for
+ *   done - amount completed since the previous message
+ */
+struct td_valve_req {
+       unsigned long need;
+       unsigned long done;
+};
+
+#endif /* _TAPDISK_VALVE_H_ */
diff --git a/tools/blktap3/drivers/block-vhd.c b/tools/blktap3/drivers/block-vhd.c
--- a/tools/blktap3/drivers/block-vhd.c
+++ b/tools/blktap3/drivers/block-vhd.c
@@ -763,7 +763,8 @@ vhd_log_close(struct vhd_state *s)
 }
 
 static int
-_vhd_close(td_driver_t *driver)
+_vhd_close(td_driver_t *driver,
+        struct tqh_td_image_handle *head __attribute__((unused)))
 {
        int err;
        struct vhd_state *s;
diff --git a/tools/blktap3/drivers/block-vindex.c b/tools/blktap3/drivers/block-vindex.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/block-vindex.c
@@ -0,0 +1,936 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "tapdisk.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-server.h"
+#include "tapdisk-interface.h"
+
+#include "libvhd.h"
+#include "libvhd-index.h"
+
+#define DBG(_level, _f, _a...)       tlog_write(_level, _f, ##_a)
+#define ERR(_err, _f, _a...)         tlog_error(_err, _f, ##_a)
+#define WARN(_f, _a...)              tlog_write(TLOG_WARN, _f, ##_a)
+
+#define ASSERT(condition)                                      \
+       if (!(condition)) {                                     \
+               WARN("FAILED ASSERTION: '%s'\n", #condition);   \
+               td_panic();                                     \
+       }
+
+#define VHD_INDEX_FILE_POOL_SIZE     12
+#define VHD_INDEX_CACHE_SIZE         4
+#define VHD_INDEX_REQUESTS           (TAPDISK_DATA_REQUESTS + VHD_INDEX_CACHE_SIZE)
+
+#define VHD_INDEX_BLOCK_READ_PENDING 0x0001
+#define VHD_INDEX_BLOCK_VALID        0x0002
+
+#define VHD_INDEX_BAT_CLEAR          0
+#define VHD_INDEX_BIT_CLEAR          1
+#define VHD_INDEX_BIT_SET            2
+#define VHD_INDEX_CACHE_MISS         3
+#define VHD_INDEX_META_READ_PENDING  4
+
+typedef struct vhd_index             vhd_index_t;
+typedef struct vhd_index_block       vhd_index_block_t;
+typedef struct vhd_index_request     vhd_index_request_t;
+typedef struct vhd_index_file_ref    vhd_index_file_ref_t;
+
+TAILQ_HEAD(tqh_vhd_index_request, vhd_index_request);
+
+/*
+ * One read in progress, or a request waiting on a block's queue.
+ *   off   - byte offset of a data read within its file
+ *   treq  - the tapdisk request being served
+ *   index - owning index; NULL while the request sits on the free list
+ *   tiocb - I/O control block submitted for the read
+ *   next  - linkage on a block's request queue
+ *   file  - file reference used by a data read
+ */
+struct vhd_index_request {
+       off64_t                      off;
+       td_request_t                 treq;
+       vhd_index_t                 *index;
+       struct tiocb                 tiocb;
+       TAILQ_ENTRY(vhd_index_request) next;
+       vhd_index_file_ref_t        *file;
+};
+
+/*
+ * One cached index block.
+ *   blk        - block number this entry caches
+ *   seqno      - recency counter used to pick an eviction victim
+ *   state      - VHD_INDEX_BLOCK_* flags
+ *   vhdi_block - the entry table for this block
+ *   table_size - size of that table in bytes
+ *   queue      - requests waiting on this block
+ *   req        - embedded request used for the block's metadata read
+ */
+struct vhd_index_block {
+       uint64_t                     blk;
+       uint32_t                     seqno;
+       td_flag_t                    state;
+       vhdi_block_t                 vhdi_block;
+       int                          table_size;
+       struct tqh_vhd_index_request queue;
+       vhd_index_request_t          req;
+};
+
+/*
+ * One slot of the open-file pool.
+ *   fd     - open descriptor, or -1 when the slot holds no file
+ *   fid    - id of the file held by this slot
+ *   seqno  - recency counter used to pick an eviction victim
+ *   refcnt - number of users; slots with a non-zero count are not evicted
+ */
+struct vhd_index_file_ref {
+       int                          fd;
+       vhdi_file_id_t               fid;
+       uint32_t                     seqno;
+       uint32_t                     refcnt;
+};
+
+/*
+ * Per-driver state of the vhd index.
+ *
+ * cache[] holds the blocks currently installed; cache_list[] is the
+ * backing storage for them and cache_free_list[] the stack of entries not
+ * yet handed out. The requests_* members follow the same storage-plus-
+ * free-stack arrangement for request structures. fds[] is the pool of
+ * open data files.
+ */
+struct vhd_index {
+       char                        *name;
+
+       vhdi_bat_t                   bat;
+       vhdi_context_t               vhdi;
+       vhdi_file_table_t            files;
+
+       vhd_index_file_ref_t         fds[VHD_INDEX_FILE_POOL_SIZE];
+
+       vhd_index_block_t           *cache[VHD_INDEX_CACHE_SIZE];
+
+       int                          cache_free_cnt;
+       vhd_index_block_t           *cache_free_list[VHD_INDEX_CACHE_SIZE];
+       vhd_index_block_t            cache_list[VHD_INDEX_CACHE_SIZE];
+
+       int                          requests_free_cnt;
+       vhd_index_request_t         *requests_free_list[VHD_INDEX_REQUESTS];
+       vhd_index_request_t          requests_list[VHD_INDEX_REQUESTS];
+
+       td_driver_t                 *driver;
+};
+
+static void vhd_index_complete_meta_read(void *, struct tiocb *, int);
+static void vhd_index_complete_data_read(void *, struct tiocb *, int);
+
+/*
+ * Iterate over the requests queued on _block; _tmp holds the next entry so
+ * the current one may be removed inside the loop. Replaces the former
+ * list_for_each_entry_safe((_req), (_tmp), &(_block)->queue, next).
+ */
+#define vhd_index_block_for_each_request(_block, _req, _tmp)           \
+    TAILQ_FOREACH_SAFE((_req), &(_block)->queue, next, (_tmp))
+
+/* Reset a request structure to all zeroes. */
+static inline void
+vhd_index_initialize_request(vhd_index_request_t *req)
+{
+       memset(req, 0, sizeof(*req));
+}
+
+/*
+ * Reset a cache block for reuse: clear its block number and state, empty
+ * its request queue, reset the embedded request and zero the entry table.
+ * seqno, the table pointer and table_size are kept.
+ *
+ * The table is only cleared if it has been allocated: vhd_index_init()
+ * runs this on every block before vhd_index_allocate_cache() has set up
+ * the tables, and memset() must not be handed a null pointer.
+ */
+static inline void
+vhd_index_initialize_block(vhd_index_block_t *block)
+{
+       block->blk   = 0;
+       block->state = 0;
+       TAILQ_INIT(&block->queue);
+       vhd_index_initialize_request(&block->req);
+       if (block->vhdi_block.table)
+               memset(block->vhdi_block.table, 0, block->table_size);
+}
+
+/*
+ * Put @index into its initial state: everything zeroed, every cache block
+ * and every request structure on its free list, and every slot of the
+ * file pool marked as holding no descriptor.
+ */
+static void
+vhd_index_init(vhd_index_t *index)
+{
+       int i;
+
+       memset(index, 0, sizeof(vhd_index_t));
+
+       index->cache_free_cnt = VHD_INDEX_CACHE_SIZE;
+       for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) {
+               index->cache_free_list[i] = index->cache_list + i;
+               vhd_index_initialize_block(index->cache_free_list[i]);
+       }
+
+       index->requests_free_cnt = VHD_INDEX_REQUESTS;
+       for (i = 0; i < VHD_INDEX_REQUESTS; i++) {
+               index->requests_free_list[i] = index->requests_list + i;
+               vhd_index_initialize_request(index->requests_free_list[i]);
+       }
+
+       for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++)
+               index->fds[i].fd = -1;
+}
+
+/*
+ * Allocate a zeroed, sector-aligned entry table for every cache block.
+ * The table holds index->vhdi.spb entries, padded by vhd_bytes_padded().
+ *
+ * Returns 0 on success. If any allocation fails, all tables are released
+ * again and -ENOMEM is returned.
+ */
+static int
+vhd_index_allocate_cache(vhd_index_t *index)
+{
+       size_t bytes;
+       int n;
+
+       bytes = vhd_bytes_padded(index->vhdi.spb * sizeof(vhdi_entry_t));
+
+       for (n = 0; n < VHD_INDEX_CACHE_SIZE; n++) {
+               vhd_index_block_t *slot = &index->cache_list[n];
+               void *mem;
+
+               if (posix_memalign(&mem, VHD_SECTOR_SIZE, bytes))
+                       goto undo;
+
+               memset(mem, 0, bytes);
+               slot->vhdi_block.table   = (vhdi_entry_t *)mem;
+               slot->vhdi_block.entries = index->vhdi.spb;
+               slot->table_size         = bytes;
+       }
+
+       return 0;
+
+undo:
+       /* tables never allocated are still NULL, which free() accepts */
+       for (n = 0; n < VHD_INDEX_CACHE_SIZE; n++) {
+               free(index->cache_list[n].vhdi_block.table);
+               index->cache_list[n].vhdi_block.table = NULL;
+       }
+
+       return -ENOMEM;
+}
+
+/*
+ * Release what the index owns: the cache tables, every open descriptor in
+ * the file pool, the file table, the BAT table and the name.
+ *
+ * The vhdi context is not closed here, and the freed pointers are not
+ * reset.
+ */
+static void
+vhd_index_free(vhd_index_t *index)
+{
+       int i;
+
+       for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++)
+               free(index->cache_list[i].vhdi_block.table);
+
+       for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++)
+               if (index->fds[i].fd != -1)
+                       close(index->fds[i].fd);
+
+       vhdi_file_table_free(&index->files);
+       free(index->bat.table);
+       free(index->name);
+}
+
+/*
+ * Load the on-disk metadata for index->name: the BAT, then the vhdi index
+ * file it points to (opened read-only with O_DIRECT), then the file
+ * table.
+ *
+ * Returns 0 on success or the error of the step that failed. If the index
+ * file or the file table cannot be loaded, the BAT table is freed and the
+ * bat, vhdi and files members are zeroed. A failure to load the BAT
+ * returns without any cleanup.
+ */
+static int
+vhd_index_load(vhd_index_t *index)
+{
+       int err;
+
+       err = vhdi_bat_load(index->name, &index->bat);
+       if (err)
+               return err;
+
+       err = vhdi_open(&index->vhdi,
+                       index->bat.index_path,
+                       O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (err)
+               goto fail;
+
+       err = vhdi_file_table_load(index->bat.file_table_path, &index->files);
+       if (err) {
+               vhdi_close(&index->vhdi);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       free(index->bat.table);
+       memset(&index->bat, 0, sizeof(vhdi_bat_t));
+       memset(&index->vhdi, 0, sizeof(vhdi_context_t));
+       memset(&index->files, 0, sizeof(vhdi_file_table_t));
+       return err;
+}
+
+/*
+ * Driver open hook: initialise the index, load its metadata from @name,
+ * allocate the block cache and fill in driver->info.
+ *
+ * Returns 0 on success, -ENOMEM if the name cannot be copied, or the
+ * error from loading the metadata or allocating the cache.
+ *
+ * If the cache cannot be allocated, the vhdi context opened during the
+ * load is closed before the rest is freed -- the same order
+ * vhd_index_close() uses. After a load failure index->name is reset so
+ * that no dangling pointer is left behind.
+ */
+static int
+vhd_index_open(td_driver_t *driver, const char *name,
+        td_flag_t flags __attribute__((unused)))
+{
+       int err;
+       vhd_index_t *index;
+
+       index = (vhd_index_t *)driver->data;
+
+       vhd_index_init(index);
+
+       index->name = strdup(name);
+       if (!index->name)
+               return -ENOMEM;
+
+       err = vhd_index_load(index);
+       if (err) {
+               free(index->name);
+               index->name = NULL;
+               return err;
+       }
+
+       err = vhd_index_allocate_cache(index);
+       if (err) {
+               vhdi_close(&index->vhdi);
+               vhd_index_free(index);
+               return err;
+       }
+
+       driver->info.size = index->bat.vhd_blocks * index->bat.vhd_block_size;
+       driver->info.sector_size = VHD_SECTOR_SIZE;
+       driver->info.info = 0;
+
+       index->driver = driver;
+
+       DPRINTF("opened vhd index %s\n", name);
+
+       return 0;
+}
+
+/*
+ * Driver close hook: close the vhdi context, then free everything the
+ * index owns. Always returns 0.
+ */
+static int
+vhd_index_close(td_driver_t *driver,
+        struct tqh_td_image_handle *head __attribute__((unused)))
+{
+       vhd_index_t *index;
+
+       index = (vhd_index_t *)driver->data;
+       vhdi_close(&index->vhdi);
+
+       /* log before vhd_index_free() releases index->name */
+       DPRINTF("closed vhd index %s\n", index->name);
+
+       vhd_index_free(index);
+
+       return 0;
+}
+
+/*
+ * Record a use of @ref by bumping its recency counter. When the counter
+ * reaches 0xFFFFFFFF, the counters of all pool slots are halved so that
+ * their relative order is kept while making room to count further.
+ */
+static inline void
+vhd_index_touch_file_ref(vhd_index_t *index, vhd_index_file_ref_t *ref)
+{
+       int slot;
+
+       ref->seqno++;
+       if (ref->seqno != 0xFFFFFFFF)
+               return;
+
+       for (slot = 0; slot < VHD_INDEX_FILE_POOL_SIZE; slot++)
+               index->fds[slot].seqno >>= 1;
+}
+
+/* Take a reference on a pool slot; referenced slots are not evicted. */
+static inline void
+vhd_index_get_file_ref(vhd_index_file_ref_t *ref)
+{
+       ++ref->refcnt;
+}
+
+/* Drop a reference taken with vhd_index_get_file_ref(). */
+static inline void
+vhd_index_put_file_ref(vhd_index_file_ref_t *ref)
+{
+       --ref->refcnt;
+}
+
+/*
+ * Pick the pool slot to reuse: among the slots nobody holds a reference
+ * on, the one with the lowest recency counter. Returns NULL if every slot
+ * is referenced.
+ *
+ * The scan covers the whole pool. It used to start at slot 1, which left
+ * slot 0 permanently unused.
+ */
+static inline vhd_index_file_ref_t *
+vhd_index_find_lru_file_ref(vhd_index_t *index)
+{
+       int i;
+       uint32_t min;
+       vhd_index_file_ref_t *lru;
+
+       lru = NULL;
+       min = (uint32_t)-1;
+
+       for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++) {
+               if (index->fds[i].refcnt)
+                       continue;
+
+               if (!lru || index->fds[i].seqno < min) {
+                       min = index->fds[i].seqno;
+                       lru = index->fds + i;
+               }
+       }
+
+       return lru;
+}
+
+/*
+ * Open the data file with id @id into pool slot @ref.
+ *
+ * The path comes from the first file-table entry carrying that id.
+ * Returns 0 on success (the slot then has fd and fid set and a reference
+ * count of zero), -ENOENT if the id has no path, or -errno if open()
+ * fails.
+ */
+static inline int
+vhd_index_open_file(vhd_index_t *index,
+                   vhdi_file_id_t id, vhd_index_file_ref_t *ref)
+{
+       char *found = NULL;
+       int n;
+
+       for (n = 0; n < index->files.entries; n++) {
+               if (index->files.table[n].file_id != id)
+                       continue;
+
+               found = index->files.table[n].path;
+               break;
+       }
+
+       if (!found)
+               return -ENOENT;
+
+       ref->fd = open(found, O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (ref->fd == -1)
+               return -errno;
+
+       ref->fid    = id;
+       ref->refcnt = 0;
+
+       return 0;
+}
+
+/*
+ * Get a referenced pool slot for the data file with id @id.
+ *
+ * If a slot already carries that id it is reused. Otherwise the least
+ * recently used unreferenced slot is taken over: its descriptor, if any,
+ * is closed and the requested file is opened in its place.
+ *
+ * On success *ref points at the slot, whose recency counter and reference
+ * count have both been bumped, and 0 is returned. Returns -EBUSY if every
+ * candidate slot is referenced, or the error from opening the file; in
+ * the latter case the slot is left empty. *ref is NULL on failure.
+ */
+static int
+vhd_index_get_file(vhd_index_t *index,
+                  vhdi_file_id_t id, vhd_index_file_ref_t **ref)
+{
+       int i, err;
+       vhd_index_file_ref_t *lru;
+
+       *ref = NULL;
+
+       for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++)
+               if (id == index->fds[i].fid) {
+                       *ref = index->fds + i;
+                       vhd_index_touch_file_ref(index, *ref);
+                       vhd_index_get_file_ref(*ref);
+                       return 0;
+               }
+
+       lru = vhd_index_find_lru_file_ref(index);
+       if (!lru)
+               return -EBUSY;
+
+       if (lru->fd != -1)
+               close(lru->fd);
+
+       err = vhd_index_open_file(index, id, lru);
+       if (err)
+               goto fail;
+
+       vhd_index_touch_file_ref(index, lru);
+       vhd_index_get_file_ref(lru);
+       *ref = lru;
+       return 0;
+
+fail:
+       /* the old descriptor is already closed: mark the slot as empty */
+       lru->fd     = -1;
+       lru->fid    = 0;
+       lru->refcnt = 0;
+       return err;
+}
+
+/*
+ * Pop a request structure off the free stack. Returns NULL when none is
+ * left. A free request is expected to have no owning index set.
+ */
+static inline vhd_index_request_t *
+vhd_index_allocate_request(vhd_index_t *index)
+{
+       vhd_index_request_t *req;
+
+       if (index->requests_free_cnt <= 0)
+               return NULL;
+
+       req = index->requests_free_list[--index->requests_free_cnt];
+       ASSERT(!req->index);
+
+       return req;
+}
+
+/*
+ * Give a request structure back to the free stack. If @head is non-NULL
+ * the request is first unlinked from that queue. The structure is zeroed
+ * before it is pushed.
+ */
+static inline void
+vhd_index_free_request(vhd_index_t *index, vhd_index_request_t *req,
+        struct tqh_vhd_index_request *head)
+{
+       if (head != NULL) {
+               TAILQ_REMOVE(head, req, next);
+       }
+
+       vhd_index_initialize_request(req);
+
+       index->requests_free_list[index->requests_free_cnt] = req;
+       index->requests_free_cnt++;
+}
+
+/*
+ * Returns 1 if the block holds valid data and no read of it is pending,
+ * 0 otherwise.
+ */
+static inline int
+vhd_index_block_valid(vhd_index_block_t *block)
+{
+       if (td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING))
+               return 0;
+
+       return td_flag_test(block->state, VHD_INDEX_BLOCK_VALID) ? 1 : 0;
+}
+
+/*
+ * Record a use of @block by bumping its recency counter. When the counter
+ * reaches 0xFFFFFFFF, the counters of all cache blocks are halved so that
+ * their relative order is kept while making room to count further.
+ */
+static inline void
+vhd_index_touch_block(vhd_index_t *index, vhd_index_block_t *block)
+{
+       int slot;
+
+       block->seqno++;
+       if (block->seqno != 0xFFFFFFFF)
+               return;
+
+       for (slot = 0; slot < VHD_INDEX_CACHE_SIZE; slot++)
+               index->cache_list[slot].seqno >>= 1;
+}
+
+/*
+ * Evict and return the least recently used installed block. Blocks with a
+ * read pending are not candidates. The chosen block's cache slot is
+ * cleared. Returns NULL if there is no candidate.
+ */
+static inline vhd_index_block_t *
+vhd_index_get_lru_block(vhd_index_t *index)
+{
+       int i, idx;
+       uint32_t min;
+       vhd_index_block_t *block, *lru;
+
+       lru = NULL;
+       min = (uint32_t)-1;
+       idx = 0;
+
+       for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) {
+               block = index->cache[i];
+
+               if (!block)
+                       continue;
+
+               if (td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING))
+                       continue;
+
+               if (!lru || block->seqno < min) {
+                       lru = block;
+                       min = block->seqno;
+                       idx = i;
+               }
+       }
+
+       if (lru)
+               index->cache[idx] = NULL;
+
+       return lru;
+}
+
+/*
+ * Obtain a cache block for reuse: from the free stack if possible,
+ * otherwise by evicting the least recently used installed block.
+ *
+ * On success the block has been reset and touched, *block points at it
+ * and 0 is returned. Returns -EBUSY, with *block NULL, if nothing can be
+ * evicted.
+ */
+static inline int
+vhd_index_allocate_block(vhd_index_t *index, vhd_index_block_t **block)
+{
+       vhd_index_block_t *b;
+
+       *block = NULL;
+
+       if (index->cache_free_cnt > 0)
+               b = index->cache_free_list[--index->cache_free_cnt];
+       else {
+               b = vhd_index_get_lru_block(index);
+               if (!b)
+                       return -EBUSY;
+       }
+
+       vhd_index_initialize_block(b);
+       vhd_index_touch_block(index, b);
+       *block = b;
+
+       return 0;
+}
+
+/*
+ * Allocate a cache block for block number @blk and place it in the first
+ * empty cache slot.
+ *
+ * Returns 0 with *block set, or the error from the allocation with *block
+ * NULL. An empty slot is expected to exist whenever the allocation
+ * succeeds.
+ */
+static int
+vhd_index_install_block(vhd_index_t *index,
+                       vhd_index_block_t **block, uint32_t blk)
+{
+       int i, err;
+       vhd_index_block_t *b;
+
+       *block = NULL;
+
+       err = vhd_index_allocate_block(index, &b);
+       if (err)
+               return err;
+
+       b->blk = blk;
+
+       for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++)
+               if (!index->cache[i]) {
+                       index->cache[i] = b;
+                       break;
+               }
+
+       ASSERT(i < VHD_INDEX_CACHE_SIZE);
+       *block = b;
+
+       return 0;
+}
+
+/*
+ * Look up the installed cache block for block number @blk. Returns NULL
+ * if that block is not in the cache.
+ */
+static inline vhd_index_block_t *
+vhd_index_get_block(vhd_index_t *index, uint32_t blk)
+{
+       int slot;
+
+       for (slot = 0; slot < VHD_INDEX_CACHE_SIZE; slot++) {
+               vhd_index_block_t *cand = index->cache[slot];
+
+               if (cand && cand->blk == blk)
+                       return cand;
+       }
+
+       return NULL;
+}
+
+/*
+ * Classify @sector using the BAT and the block cache.
+ *
+ * Returns -EINVAL if the sector lies beyond the last block, otherwise one
+ * of:
+ *   VHD_INDEX_BAT_CLEAR         - the BAT has no entry for the block
+ *   VHD_INDEX_CACHE_MISS        - the block is not in the cache
+ *   VHD_INDEX_META_READ_PENDING - the block is still being read
+ *   VHD_INDEX_BIT_CLEAR         - the sector's entry is unused
+ *   VHD_INDEX_BIT_SET           - the sector's entry is in use
+ *
+ * A block found in the cache is touched, whatever the outcome.
+ */
+static int
+vhd_index_read_cache(vhd_index_t *index, uint64_t sector)
+{
+       uint32_t blk, sec;
+       vhd_index_block_t *block;
+
+       blk = sector / index->vhdi.spb;
+
+       if (blk >= index->bat.vhd_blocks)
+               return -EINVAL;
+
+       if (index->bat.table[blk] == DD_BLK_UNUSED)
+               return VHD_INDEX_BAT_CLEAR;
+
+       block = vhd_index_get_block(index, blk);
+       if (!block)
+               return VHD_INDEX_CACHE_MISS;
+
+       vhd_index_touch_block(index, block);
+
+       if (td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING))
+               return VHD_INDEX_META_READ_PENDING;
+
+       sec = sector % index->vhdi.spb;
+       if (block->vhdi_block.table[sec].offset == DD_BLK_UNUSED)
+               return VHD_INDEX_BIT_CLEAR;
+
+       return VHD_INDEX_BIT_SET;
+}
+
+/*
+ * Count how many consecutive sectors, starting at @sector, have the same
+ * in-use state as @value (1 for in use, 0 for unused). The count stops at
+ * @secs or at the end of the block, whichever comes first.
+ *
+ * The block must be in the cache and valid.
+ */
+static int
+vhd_index_read_cache_span(vhd_index_t *index,
+                         uint64_t sector, int secs, int value)
+{
+       int i;
+       uint32_t blk, sec;
+       vhd_index_block_t *block;
+
+       blk = sector / index->vhdi.spb;
+       sec = sector % index->vhdi.spb;
+
+       ASSERT(blk < index->bat.vhd_blocks);
+
+       block = vhd_index_get_block(index, blk);
+       ASSERT(block && vhd_index_block_valid(block));
+
+       for (i = 0; i < secs && i + sec < index->vhdi.spb; i++)
+               /* stop at the first sector whose state differs from value */
+               if (value ^
+                   (block->vhdi_block.table[sec + i].offset != DD_BLK_UNUSED))
+                       break;
+
+       return i;
+}
+
+/*
+ * Queue a read of the index block for block number @blk.
+ *
+ * The block is installed in the cache first if it is not there yet. Its
+ * entry table is read from the index file at the offset given by the BAT,
+ * using the request embedded in the block;
+ * vhd_index_complete_meta_read() is called when the read is done. The
+ * block is marked as having a read pending.
+ *
+ * Returns 0 once the read is queued, or the error from installing the
+ * block. The BAT entry for @blk must be in use.
+ */
+static int
+vhd_index_schedule_meta_read(vhd_index_t *index, uint32_t blk)
+{
+       int err;
+       off64_t offset;
+       vhd_index_block_t *block;
+       vhd_index_request_t *req;
+
+       ASSERT(index->bat.table[blk] != DD_BLK_UNUSED);
+
+       block = vhd_index_get_block(index, blk);
+       if (!block) {
+               err = vhd_index_install_block(index, &block, blk);
+               if (err)
+                       return err;
+       }
+
+       offset         = vhd_sectors_to_bytes(index->bat.table[blk]);
+
+       req            = &block->req;
+       req->index     = index;
+       req->treq.sec  = blk * index->vhdi.spb;
+       req->treq.secs = block->table_size >> VHD_SECTOR_SHIFT;
+
+       td_prep_read(&req->tiocb, index->vhdi.fd,
+                    (char *)block->vhdi_block.table, block->table_size,
+                    offset, vhd_index_complete_meta_read, req);
+       td_queue_tiocb(index->driver, &req->tiocb);
+
+       td_flag_set(block->state, VHD_INDEX_BLOCK_READ_PENDING);
+
+       return 0;
+}
+
+/*
+ * Queue an asynchronous data read for @treq.
+ *
+ * The index block containing treq.sec must be cached and valid, and every
+ * sector of the request must have an index entry with a non-zero file_id and
+ * an offset other than DD_BLK_UNUSED (all asserted).
+ *
+ * The file and the byte offset are taken from the index entry of the first
+ * sector only; treq.secs sectors are then read from there in a single I/O
+ * that completes in vhd_index_complete_data_read().
+ * NOTE(review): this presumably relies on every sector of the span living in
+ * the same file at consecutive offsets -- confirm against the index format.
+ *
+ * Returns 0 when the read was queued, -EBUSY if no request could be
+ * allocated, or the error reported by vhd_index_get_file().
+ */
+static int
+vhd_index_schedule_data_read(vhd_index_t *index, td_request_t treq)
+{
+       int n, err;
+       size_t len;
+       off64_t pos;
+       uint32_t blk, first;
+       vhd_index_block_t *block;
+       vhd_index_request_t *req;
+       vhd_index_file_ref_t *file;
+
+       blk   = treq.sec / index->vhdi.spb;
+       first = treq.sec % index->vhdi.spb;
+       block = vhd_index_get_block(index, blk);
+
+       ASSERT(block && vhd_index_block_valid(block));
+
+       /* sanity: each sector of the span must be present in the index */
+       n = 0;
+       while (n < treq.secs) {
+               ASSERT(block->vhdi_block.table[first + n].file_id != 0);
+               ASSERT(block->vhdi_block.table[first + n].offset != DD_BLK_UNUSED);
+               n++;
+       }
+
+       req = vhd_index_allocate_request(index);
+       if (!req)
+               return -EBUSY;
+
+       err = vhd_index_get_file(index,
+                                block->vhdi_block.table[first].file_id, &file);
+       if (err) {
+               /* hand the unused request back before bailing out */
+               vhd_index_free_request(index, req, NULL);
+               return err;
+       }
+
+       len = vhd_sectors_to_bytes(treq.secs);
+       pos = vhd_sectors_to_bytes(block->vhdi_block.table[first].offset);
+
+       req->file  = file;
+       req->treq  = treq;
+       req->index = index;
+       req->off   = pos;
+
+       td_prep_read(&req->tiocb, file->fd, treq.buf, len, pos,
+                    vhd_index_complete_data_read, req);
+       td_queue_tiocb(index->driver, &req->tiocb);
+
+       return 0;
+}
+
+/*
+ * Park @treq on the wait queue of the index block that contains treq.sec.
+ *
+ * A request is allocated, given a copy of @treq and appended to the tail of
+ * the block's queue.  The block must exist and have a metadata read pending
+ * (asserted).
+ *
+ * Returns 0 on success or -EBUSY if no request could be allocated.
+ */
+static int
+vhd_index_queue_request(vhd_index_t *index, td_request_t treq)
+{
+       vhd_index_block_t *block;
+       vhd_index_request_t *waiter;
+
+       waiter = vhd_index_allocate_request(index);
+       if (!waiter)
+               return -EBUSY;
+
+       waiter->treq = treq;
+
+       block = vhd_index_get_block(index, treq.sec / index->vhdi.spb);
+       ASSERT(block &&
+              td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING));
+
+       TAILQ_INSERT_TAIL(&block->queue, waiter, next);
+
+       return 0;
+}
+
+/*
+ * Read entry point of the driver.
+ *
+ * The request is consumed piece by piece.  For each piece the cache state of
+ * the first remaining sector, as reported by vhd_index_read_cache(), decides
+ * what happens:
+ *   - VHD_INDEX_BAT_CLEAR / VHD_INDEX_BIT_CLEAR: the piece is handed on with
+ *     td_forward_request();
+ *   - VHD_INDEX_BIT_SET: a data read is queued for the run of present
+ *     sectors;
+ *   - VHD_INDEX_CACHE_MISS: a metadata read is scheduled and the piece is
+ *     parked on the block until that read completes;
+ *   - VHD_INDEX_META_READ_PENDING: the piece is parked on the block.
+ * A piece never extends past the end of the index block it starts in.
+ *
+ * On any failure, including an unrecognised cache state, the unprocessed
+ * remainder of the request is completed with the error and processing stops.
+ */
+static void
+vhd_index_queue_read(td_driver_t *driver, td_request_t treq)
+{
+       vhd_index_t *index;
+
+       index = (vhd_index_t *)driver->data;
+
+       while (treq.secs) {
+               int err, state;
+               td_request_t clone;
+
+               err   = 0;
+               clone = treq;
+               state = vhd_index_read_cache(index, clone.sec);
+
+               switch (state) {
+               case -EINVAL:
+                       err = -EINVAL;
+                       goto fail;
+
+               case VHD_INDEX_BAT_CLEAR:
+                       clone.secs = MIN(clone.secs, index->vhdi.spb -
+                                        (clone.sec % index->vhdi.spb));
+                       td_forward_request(clone);
+                       break;
+
+               case VHD_INDEX_BIT_CLEAR:
+                       clone.secs = vhd_index_read_cache_span(index,
+                                                              clone.sec, clone.secs, 0);
+                       td_forward_request(clone);
+                       break;
+
+               case VHD_INDEX_BIT_SET:
+                       clone.secs = vhd_index_read_cache_span(index,
+                                                              clone.sec, clone.secs, 1);
+                       err = vhd_index_schedule_data_read(index, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_INDEX_CACHE_MISS:
+                       err = vhd_index_schedule_meta_read(index, clone.sec /
+                                                          index->vhdi.spb);
+                       if (err)
+                               goto fail;
+
+                       clone.secs = MIN(clone.secs, index->vhdi.spb -
+                                        (clone.sec % index->vhdi.spb));
+                       /*
+                        * Parking can fail (-EBUSY); do not drop the piece
+                        * silently, or the request would never be completed.
+                        */
+                       err = vhd_index_queue_request(index, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               case VHD_INDEX_META_READ_PENDING:
+                       clone.secs = MIN(clone.secs, index->vhdi.spb -
+                                        (clone.sec % index->vhdi.spb));
+                       err = vhd_index_queue_request(index, clone);
+                       if (err)
+                               goto fail;
+                       break;
+
+               default:
+                       /*
+                        * Unknown cache state: fail the remainder instead of
+                        * skipping it without ever completing it.
+                        */
+                       err = state < 0 ? state : -EINVAL;
+                       goto fail;
+               }
+
+               /* advance past the piece that was just dispatched */
+               treq.sec  += clone.secs;
+               treq.secs -= clone.secs;
+               treq.buf  += vhd_sectors_to_bytes(clone.secs);
+               continue;
+
+       fail:
+               /* complete everything that is still outstanding with err */
+               clone.secs = treq.secs;
+               td_complete_request(clone, err);
+               break;
+       }
+}
+
+/*
+ * Write entry point of the driver.  Every write request is completed
+ * immediately with -EPERM; @driver is not used.
+ */
+static void
+vhd_index_queue_write(td_driver_t *driver __attribute__((unused)),
+        td_request_t treq)
+{
+       td_complete_request(treq, -EPERM);
+}
+
+/*
+ * Finish @req: complete its tapdisk request with @err, drop the reference on
+ * req->file, and release @req through vhd_index_free_request(), passing
+ * @head along unchanged.
+ *
+ * NOTE(review): requests parked by vhd_index_queue_request() do not get
+ * req->file assigned there, yet they reach this function from the error path
+ * of vhd_index_complete_meta_read() -- confirm that
+ * vhd_index_put_file_ref() tolerates such a request.
+ */
+static inline void
+vhd_index_signal_completion(vhd_index_t *index,
+                           vhd_index_request_t *req, int err,
+                struct tqh_vhd_index_request *head)
+{
+       td_complete_request(req->treq, err);
+       vhd_index_put_file_ref(req->file);
+       vhd_index_free_request(index, req, head);
+}
+
+/*
+ * Completion callback for the metadata read queued by
+ * vhd_index_schedule_meta_read(); @arg is the request embedded in the block.
+ *
+ * The block's VHD_INDEX_BLOCK_READ_PENDING flag is cleared first.  On
+ * success the table entries are converted in place with vhdi_entry_in(), the
+ * block is marked VHD_INDEX_BLOCK_VALID, and every request waiting on the
+ * block is released and resubmitted through vhd_index_queue_read().  On
+ * error the table buffer is zeroed and every waiting request is completed
+ * with @err.
+ */
+static void
+vhd_index_complete_meta_read(void *arg,
+        struct tiocb *tiocb __attribute__((unused)), int err)
+{
+       int n;
+       uint32_t blk;
+       vhd_index_t *index;
+       vhd_index_block_t *block;
+       vhd_index_request_t *meta, *waiter, *save;
+
+       meta  = (vhd_index_request_t *)arg;
+       index = meta->index;
+
+       blk   = meta->treq.sec / index->vhdi.spb;
+       block = vhd_index_get_block(index, blk);
+       ASSERT(block &&
+              td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING));
+       td_flag_clear(block->state, VHD_INDEX_BLOCK_READ_PENDING);
+
+       if (!err) {
+               for (n = 0; n < block->vhdi_block.entries; n++)
+                       vhdi_entry_in(&block->vhdi_block.table[n]);
+
+               td_flag_set(block->state, VHD_INDEX_BLOCK_VALID);
+
+               vhd_index_block_for_each_request(block, waiter, save) {
+                       /* copy out first: the waiter is released before reuse */
+                       td_request_t treq = waiter->treq;
+
+                       vhd_index_free_request(index, waiter, &block->queue);
+                       vhd_index_queue_read(index->driver, treq);
+               }
+
+               return;
+       }
+
+       /* the read failed: discard the table and fail everyone waiting on it */
+       memset(block->vhdi_block.table, 0, block->table_size);
+       vhd_index_block_for_each_request(block, waiter, save) {
+               vhd_index_signal_completion(index, waiter, err, &block->queue);
+       }
+}
+
+/*
+ * Completion callback for the data read queued by
+ * vhd_index_schedule_data_read(); @arg is the request that was queued.
+ * The request is finished with @err through vhd_index_signal_completion(),
+ * with no queue head supplied.
+ */
+static void
+vhd_index_complete_data_read(void *arg,
+        struct tiocb *tiocb __attribute__((unused)), int err)
+{
+       vhd_index_t *index;
+       vhd_index_request_t *req;
+
+       req   = (vhd_index_request_t *)arg;
+       index = req->index;
+
+    /* FIXME Can't tell for sure if req belongs to the queue */
+       vhd_index_signal_completion(index, req, err, NULL);
+}
+
+/*
+ * Parent lookup hook of the driver.  Always returns -EINVAL; neither
+ * argument is used.
+ */
+static int
+vhd_index_get_parent_id(td_driver_t *driver __attribute__((unused)),
+        td_disk_id_t *id __attribute__((unused)))
+{
+       return -EINVAL;
+}
+
+/*
+ * Parent validation hook of the driver.  Always returns -EINVAL; none of the
+ * arguments is used.
+ */
+static int
+vhd_index_validate_parent(td_driver_t *driver __attribute__((unused)),
+                         td_driver_t *parent __attribute__((unused)),
+              td_flag_t flags __attribute__((unused)))
+{
+       return -EINVAL;
+}
+
+/*
+ * Debug hook of the driver: dump the state of the index through WARN().
+ *
+ * Three sections are printed:
+ *   FILES    - each entry of the file table with its path and file id, plus
+ *              the fd and refcount of the matching slot in the fd pool
+ *              (-1 and 0 when no slot matches);
+ *   REQUESTS - every slot of the request list whose index pointer is set;
+ *   BLOCKS   - every populated cache slot with its block number, state flags
+ *              and the number of requests waiting on it.
+ */
+static void
+vhd_index_debug(td_driver_t *driver)
+{
+       int slot;
+       vhd_index_t *index;
+
+       index = (vhd_index_t *)driver->data;
+
+       WARN("VHD INDEX %s\n", index->name);
+
+       WARN("FILES:\n");
+       for (slot = 0; slot < index->files.entries; slot++) {
+               int k;
+               int fd     = -1;
+               int refcnt = 0;
+
+               /* look the file id up in the pool of open descriptors */
+               for (k = 0; k < VHD_INDEX_FILE_POOL_SIZE; k++) {
+                       if (index->fds[k].fid == index->files.table[slot].file_id) {
+                               fd     = index->fds[k].fd;
+                               refcnt = index->fds[k].refcnt;
+                       }
+               }
+
+               WARN("%s %u %d %d\n",
+                    index->files.table[slot].path,
+                    index->files.table[slot].file_id,
+                    fd, refcnt);
+       }
+
+       WARN("REQUESTS:\n");
+       for (slot = 0; slot < VHD_INDEX_REQUESTS; slot++) {
+               vhd_index_request_t *req = &index->requests_list[slot];
+
+               /* slots without an index pointer are skipped */
+               if (!req->index)
+                       continue;
+
+               WARN("%d: buf: %p, sec: 0x%08"PRIx64", secs: 0x%04x, "
+                    "fid: %u, off: 0x%016"PRIx64"\n", slot, req->treq.buf,
+                    req->treq.sec, req->treq.secs, req->file->fid, req->off);
+       }
+
+       WARN("BLOCKS:\n");
+       for (slot = 0; slot < VHD_INDEX_CACHE_SIZE; slot++) {
+               int waiting = 0;
+               vhd_index_block_t *block = index->cache[slot];
+               vhd_index_request_t *req, *save;
+
+               if (!block)
+                       continue;
+
+               vhd_index_block_for_each_request(block, req, save) {
+                       waiting++;
+               }
+
+               WARN("%d: blk: 0x%08"PRIx64", state: 0x%08x, queued: %d\n",
+                    slot, block->blk, block->state, waiting);
+       }
+}
+
+/*
+ * Driver descriptor for the VHD index driver: names the disk type, sizes the
+ * per-instance private data as a vhd_index_t, and wires the open, close,
+ * read, write, parent and debug hooks to the vhd_index_* functions.
+ */
+struct tap_disk tapdisk_vhd_index = {
+       .disk_type                = "tapdisk_vhd_index",
+       .flags                    = 0,
+       .private_data_size        = sizeof(vhd_index_t),
+       .td_open                  = vhd_index_open,
+       .td_close                 = vhd_index_close,
+       .td_queue_read            = vhd_index_queue_read,
+       .td_queue_write           = vhd_index_queue_write,
+       .td_get_parent_id         = vhd_index_get_parent_id,
+       .td_validate_parent       = vhd_index_validate_parent,
+       .td_debug                 = vhd_index_debug,
+};
diff --git a/tools/blktap3/drivers/io-optimize.h 
b/tools/blktap3/drivers/io-optimize.h
--- a/tools/blktap3/drivers/io-optimize.h
+++ b/tools/blktap3/drivers/io-optimize.h
@@ -2,28 +2,7 @@
  * Copyright (c) 2008, XenSource Inc.
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of XenSource Inc. nor the names of its contributors
- *       may be used to endorse or promote products derived from this software
- *       without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
- * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * FIXME License missing from blktap2.5.
 */
 
 #ifndef __IO_OPTIMIZE_H__
diff --git a/tools/blktap3/drivers/profile.h b/tools/blktap3/drivers/profile.h
--- a/tools/blktap3/drivers/profile.h
+++ b/tools/blktap3/drivers/profile.h
@@ -2,28 +2,7 @@
  * Copyright (c) 2008, XenSource Inc.
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of XenSource Inc. nor the names of its contributors
- *       may be used to endorse or promote products derived from this software
- *       without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
- * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * FIXME blktap2.5 has no license
 */
 
 #ifndef __TAP_PROFILE_H__
diff --git a/tools/blktap3/drivers/tapdisk-blktap.h 
b/tools/blktap3/drivers/tapdisk-blktap.h
deleted file mode 100644
--- a/tools/blktap3/drivers/tapdisk-blktap.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2010, Citrix Systems, Inc.
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of XenSource Inc. nor the names of its contributors
- *       may be used to endorse or promote products derived from this software
- *       without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
- * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _TAPDISK_BLKTAP_H_
-#define _TAPDISK_BLKTAP_H_
-
-typedef struct td_blktap td_blktap_t;
-typedef struct td_blktap_req td_blktap_req_t;
-
-#include "blktap3.h"
-#include "tapdisk-vbd.h"
-
-#if 0
-struct td_blktap_stats {
-    struct {
-        unsigned long long in;
-        unsigned long long out;
-    } reqs;
-    struct {
-        unsigned long long in;
-        unsigned long long out;
-    } kicks;
-};
-#endif
-
-struct td_blktap {
-    int minor;
-    //td_vbd_t *vbd;
-
-#if 0
-    int fd;
-#endif
-
-#if 0
-    void *vma;
-    size_t vma_size;
-
-    struct blktap_sring *sring;
-    unsigned int req_cons;
-    unsigned int rsp_prod_pvt;
-#endif
-
-#if 0
-    int event_id;
-    void *vstart;
-
-    int n_reqs;
-    td_blktap_req_t *reqs;
-    int n_reqs_free;
-    td_blktap_req_t **reqs_free;
-#endif
-
-    //TAILQ_ENTRY(td_blktap) entry;
-
-    //struct td_blktap_stats stats;
-};
-
-#endif /* _TAPDISK_BLKTAP_H_ */
diff --git a/tools/blktap3/drivers/tapdisk-control.c 
b/tools/blktap3/drivers/tapdisk-control.c
--- a/tools/blktap3/drivers/tapdisk-control.c
+++ b/tools/blktap3/drivers/tapdisk-control.c
@@ -49,7 +49,7 @@
 #include "tapdisk-disktype.h"
 #include "tapdisk-stats.h"
 #include "tapdisk-control.h"
-#include "sring/td-blkif.h"
+#include "tapdisk-nbdserver.h"
 
 #define TD_CTL_MAX_CONNECTIONS  10
 #define TD_CTL_SOCK_BACKLOG     32
@@ -57,10 +57,11 @@
 #define TD_CTL_SEND_TIMEOUT     10
 #define TD_CTL_SEND_BUFSZ       ((size_t)4096)
 
-#define DBG(_f, _a...)             tlog_syslog(LOG_DEBUG, "%s:%d " _f, \
+#define DBG(_f, _a...)             tlog_syslog(TLOG_DBG, "%s:%d " _f, \
         __FILE__, __LINE__, ##_a)
 #define ERR(err, _f, _a...)        tlog_error(err, "%s:%d " _f, __FILE__, \
     __LINE__, ##_a)
+#define INFO(_f, _a...)            tlog_syslog(TLOG_INFO, "control: " _f, ##_a)
 
 #define ASSERT(_p)                                                     \
        if (!(_p)) {                                                    \
@@ -96,9 +97,8 @@ struct tapdisk_ctl_conn {
     struct tapdisk_control_info *info;
 };
 
-#define TAPDISK_MSG_REENTER    (1<<0)   /* non-blocking, idempotent */
-#define TAPDISK_MSG_VERBOSE    (1<<1)   /* tell syslog about it */
-#define TAPDISK_MSG_VERBOSE_ERROR (1<<2)    /* tell syslog about it, with 
errors */
+#define TAPDISK_MSG_REENTER    (1<<0) /* non-blocking, idempotent */
+#define TAPDISK_MSG_VERBOSE    (1<<1) /* tell syslog about it */
 
 struct tapdisk_control_info {
     int (*handler) (struct tapdisk_ctl_conn *, tapdisk_message_t *,
@@ -128,7 +128,7 @@ static inline size_t page_align(size_t s
 static void tapdisk_ctl_conn_uninit(struct tapdisk_ctl_conn *conn)
 {
     if (conn->out.buf) {
-        munmap(conn->out.buf, conn->out.bufsz);
+        free(conn->out.buf);
         conn->out.buf = NULL;
     }
 }
@@ -136,22 +136,18 @@ static void tapdisk_ctl_conn_uninit(stru
 static int
 tapdisk_ctl_conn_init(struct tapdisk_ctl_conn *conn, size_t bufsz)
 {
-    int prot, flags, err;
+       int err;
 
     memset(conn, 0, sizeof(*conn));
     conn->out.event_id = -1;
     conn->in.event_id = -1;
 
-    prot = PROT_READ | PROT_WRITE;
-    flags = MAP_ANONYMOUS | MAP_PRIVATE;
-
-    conn->out.buf = mmap(NULL, bufsz, prot, flags, -1, 0);
-    if (conn->out.buf == MAP_FAILED) {
-        conn->out.buf = NULL;
-        err = -ENOMEM;
-        goto fail;
-    }
-    conn->out.bufsz = page_align(bufsz);
+       conn->out.buf = malloc(bufsz);
+       if (!conn->out.buf) {
+               err = -ENOMEM;
+               goto fail;
+       }
+       conn->out.bufsz = bufsz; /* record what malloc() actually provided */
 
     return 0;
 
@@ -264,8 +260,15 @@ static void tapdisk_ctl_conn_drain(struc
     fd_set wfds;
     int n, mode;
 
-    ASSERT(conn->out.done);
-    ASSERT(conn->fd >= 0);
+       if (!conn->out.done) {
+               /* we accepted this connection but haven't received the message 
+                * body yet. Since this tapdisk is on its way out, just drop 
+                * the connection. */
+               tapdisk_ctl_conn_close(conn);
+               return;
+       }
+
+       ASSERT(conn->fd >= 0);
 
     while (tapdisk_ctl_conn_connected(conn)) {
         FD_ZERO(&wfds);
@@ -284,7 +287,6 @@ static void tapdisk_ctl_conn_drain(struc
     }
 }
 
-
 struct tapdisk_ctl_conn *tapdisk_ctl_conn_open(int fd)
 {
     struct tapdisk_ctl_conn *conn;
@@ -301,9 +303,10 @@ struct tapdisk_ctl_conn *tapdisk_ctl_con
     if (conn->out.event_id < 0)
         return NULL;
 
-    conn->fd = fd;
-    conn->out.prod = conn->out.buf;
-    conn->out.cons = conn->out.buf;
+       conn->fd       = fd;
+       conn->out.prod = conn->out.buf;
+       conn->out.cons = conn->out.buf;
+       conn->out.done = 0;
 
     tapdisk_ctl_conn_mask_out(conn);
 
@@ -471,7 +474,7 @@ tapdisk_control_write_message(struct tap
 {
     size_t size = sizeof(*message), count;
 
-    if (conn->info->flags & TAPDISK_MSG_VERBOSE)
+       if (conn->info && conn->info->flags & TAPDISK_MSG_VERBOSE)
         DBG("sending '%s' message\n", tapdisk_message_name(message->type));
 
     count = tapdisk_ctl_conn_write(conn, message, size);
@@ -612,9 +615,6 @@ tapdisk_control_open_image(
                goto out;
        }
 
-    /* TODO Add after everything has been initialised? */
-       tapdisk_server_add_vbd(vbd);
-
     /* TODO check for unsupported flags */
        flags = 0;
        if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_RDONLY)
@@ -655,6 +655,18 @@ tapdisk_control_open_image(
                goto fail_close;
        }
 
+       /*
+        * For now, let's do this automatically on all 'open' calls In the 
+        * future, we'll probably want a separate call to start the NBD server
+        */
+       err = tapdisk_vbd_start_nbdserver(vbd);
+       if (err) {
+               EPRINTF("failed to start nbdserver: %d\n",err);
+               goto fail_close;
+       }
+
+       tapdisk_server_add_vbd(vbd);
+
        err = 0;
 
 out:
@@ -663,9 +675,7 @@ out:
         response->u.image.sector_size = info.sector_size;
         response->u.image.info = info.info;
         response->type = TAPDISK_MESSAGE_OPEN_RSP;
-    } else
-        if (vbd)
-            tapdisk_server_remove_vbd(vbd);
+    }
 
        return err;
 
@@ -713,9 +723,22 @@ tapdisk_control_close_image(struct tapdi
      * I assume we have disconnected from the ring before? If yes, then
      * make sure we check this. */
 
+       if (td_flag_test(vbd->state, TD_VBD_PAUSED))
+               EPRINTF("warning: closing paused VBD %s", vbd->name);
+
+       if(vbd->nbdserver) {
+         tapdisk_nbdserver_pause(vbd->nbdserver);
+       }
+
+       /* FIXME This was executed if tapdisk_blktap_remove_device returned 
ENOTTY */
     while (!TAILQ_EMPTY(&vbd->pending_requests))
         tapdisk_server_iterate();
 
+       if (vbd->nbdserver) {
+               tapdisk_nbdserver_free(vbd->nbdserver);
+               vbd->nbdserver = NULL;
+       }
+
        tapdisk_vbd_close_vdi(vbd);
 
        /* NB. vbd->name free should probably belong into close_vdi,
@@ -725,6 +748,7 @@ tapdisk_control_close_image(struct tapdi
        vbd->name = NULL;
 
     tapdisk_server_remove_vbd(vbd);
+       /* FIXME free vbd? */
 
 out:
     if (!err) {
@@ -747,24 +771,23 @@ tapdisk_control_pause_vbd(struct tapdisk
     assert(request);
     assert(response);
 
-    len = strnlen(request->u.string.text, TAPDISK_MESSAGE_STRING_LENGTH);
+    len = strnlen(request->u.params.path, TAPDISK_MESSAGE_MAX_PATH_LENGTH);
 
     /* TODO boilerplate */
     if (len < 1) {
         err = -EINVAL;
         goto out;
     }
-    if (len >= TAPDISK_MESSAGE_STRING_LENGTH) {
+    if (len >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) {
         err = -ENAMETOOLONG;
         goto out;
     }
 
        response->type = TAPDISK_MESSAGE_PAUSE_RSP;
 
-    /* TODO Need to fix this in control/tap-ctl-pause.c */
-       vbd = tapdisk_server_get_vbd(request->u.string.text);
+       vbd = tapdisk_server_get_vbd(request->u.params.path);
        if (!vbd) {
-               err = -EINVAL;
+               err = -ENODEV;
                goto out;
        }
 
@@ -798,30 +821,44 @@ tapdisk_control_resume_vbd(
     assert(request);
     assert(response);
 
-    len = strnlen(request->u.string.text, TAPDISK_MESSAGE_STRING_LENGTH);
+    len = strnlen(request->u.resume.params1, TAPDISK_MESSAGE_MAX_PATH_LENGTH);
 
     /* TODO boilerplate */
     if (len < 1) {
         err = -EINVAL;
         goto out;
     }
-    if (len >= TAPDISK_MESSAGE_STRING_LENGTH) {
+    if (len >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) {
         err = -ENAMETOOLONG;
         goto out;
     }
 
+    /* TODO validate secondary */
+
        response->type = TAPDISK_MESSAGE_RESUME_RSP;
 
-    /* TODO Need to fix this in control/tap-ctl-pause.c */
-       vbd = tapdisk_server_get_vbd(request->u.string.text);
+       INFO("Resuming: flags=0x%08x secondary=%p\n",
+                       request->u.resume.flags, request->u.resume.secondary);
+
+       vbd = tapdisk_server_get_vbd(request->u.resume.params1);
        if (!vbd) {
-               err = -EINVAL;
+               err = -ENODEV;
                goto out;
        }
 
-    /* TODO What's this path? */
-    if (request->u.params.path[0])
-        desc = request->u.params.path;
+       if (request->u.resume.flags & TAPDISK_MESSAGE_FLAG_SECONDARY) {
+               char *name = strdup(request->u.resume.secondary);
+               if (!name) {
+                       err = -errno;
+                       goto out;
+               }
+               INFO("Resuming with secondary '%s'\n", name);
+               vbd->secondary_name = name;
+               vbd->flags |= TD_OPEN_SECONDARY;
+       }
+
+    if (request->u.resume.params2[0])
+        desc = request->u.resume.params2;
 
     err = tapdisk_vbd_resume(vbd, desc);
 out:
@@ -837,27 +874,33 @@ tapdisk_control_stats(struct tapdisk_ctl
 {
     td_stats_t _st, *st = &_st;
     td_vbd_t *vbd;
-    size_t rv = 0;
-    int err = 0;
-    int len;
+    size_t rv;
+       void *buf;
+       int new_size;
+    size_t len;
 
     assert(request);
     assert(response);
 
-    len = strnlen(request->u.string.text, TAPDISK_MESSAGE_STRING_LENGTH);
+       buf = malloc(TD_CTL_SEND_BUFSZ);
+       if (!buf) {
+               rv = -ENOMEM;
+               goto out;
+       }
 
-    tapdisk_stats_init(st,
-                       conn->out.buf + sizeof(*response),
-                       conn->out.bufsz - sizeof(*response));
+    len = strnlen(request->u.params.path, TAPDISK_MESSAGE_MAX_PATH_LENGTH);
+
+       tapdisk_stats_init(st, buf, TD_CTL_SEND_BUFSZ);
+
     if (len > 1) {
-        if (len >= TAPDISK_MESSAGE_STRING_LENGTH) {
-            err = -ENAMETOOLONG;
+        if (len >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) {
+            rv = -ENAMETOOLONG;
             goto out;
         }
 
-        vbd = tapdisk_server_get_vbd(request->u.string.text);
+        vbd = tapdisk_server_get_vbd(request->u.params.path);
         if (!vbd) {
-            err = -ENODEV;
+            rv = -ENODEV;
                        goto out;
                }
 
@@ -875,8 +918,27 @@ tapdisk_control_stats(struct tapdisk_ctl
        }
 
     rv = tapdisk_stats_length(st);
+
+       if (rv > conn->out.bufsz - sizeof(*response)) { /* header, not pointer, size */
+               ASSERT(conn->out.prod == conn->out.buf);
+               ASSERT(conn->out.cons == conn->out.buf);
+               new_size = rv + sizeof(*response);
+               buf = realloc(conn->out.buf, new_size);
+               if (!buf) {
+                       rv = -ENOMEM;
+                       goto out;
+               }
+               conn->out.buf = buf;
+               conn->out.bufsz = new_size;
+               conn->out.prod = buf;
+               conn->out.cons = buf;
+       }
+       if (rv > 0) {
+               memcpy(conn->out.buf + sizeof(*response), st->buf, rv); /* payload follows the header */
+       }
 out:
-    if (!err) {
+       free(st->buf);
+    if (!rv) {
         response->type = TAPDISK_MESSAGE_STATS_RSP;
         response->u.info.length = rv;
     }
@@ -885,7 +947,7 @@ out:
     if (rv > 0)
         conn->out.prod += rv;
 
-    return err;
+    return rv;
 }
 
 /**
@@ -1063,21 +1125,16 @@ struct tapdisk_control_info message_info
     [TAPDISK_MESSAGE_XENBLKIF_CONNECT] = {
                                           .handler =
                                           tapdisk_control_xenblkif_connect,
-                                          .flags =
-                                          TAPDISK_MSG_VERBOSE |
-                                          TAPDISK_MSG_VERBOSE_ERROR,
+                                          .flags = TAPDISK_MSG_VERBOSE
                                           },
     [TAPDISK_MESSAGE_XENBLKIF_DISCONNECT] = {
                                              .handler =
                                              
tapdisk_control_xenblkif_disconnect,
                                              .flags = TAPDISK_MSG_VERBOSE
-                                             || TAPDISK_MSG_VERBOSE_ERROR,
                                              },
     [TAPDISK_MESSAGE_DISK_INFO] = {
                                    .handler = tapdisk_control_disk_info,
-                                   .flags =
-                                   TAPDISK_MSG_VERBOSE |
-                                   TAPDISK_MSG_VERBOSE_ERROR,
+                                   .flags = TAPDISK_MSG_VERBOSE
                                    },
 };
 
@@ -1089,15 +1146,13 @@ static void tapdisk_control_handle_reque
     int err, excl;
     tapdisk_message_t message, response;
     struct tapdisk_ctl_conn *conn = private;
-    struct tapdisk_control_info *info;
+
+       conn->info = NULL;
 
     err = tapdisk_control_read_message(conn->fd, &message, 2);
     if (err)
         goto close;
 
-    if (conn->in.busy)
-        goto busy;
-
        err = tapdisk_control_validate_request(&message);
        if (err)
         goto invalid;
@@ -1105,16 +1160,19 @@ static void tapdisk_control_handle_reque
     if (message.type > TAPDISK_MESSAGE_EXIT)
         goto invalid;
 
-    info = &message_infos[message.type];
+    conn->info = &message_infos[message.type];
 
-    if (!info->handler)
+    if (!conn->info->handler)
         goto invalid;
 
-    if (info->flags & TAPDISK_MSG_VERBOSE)
+    if (conn->info->flags & TAPDISK_MSG_VERBOSE)
         DBG("received '%s' message\n",
             tapdisk_message_name(message.type));
 
-    excl = !(info->flags & TAPDISK_MSG_REENTER);
+  if (conn->in.busy)
+        goto busy;
+
+    excl = !(conn->info->flags & TAPDISK_MSG_REENTER);
     if (excl) {
         if (td_control.busy)
             goto busy;
@@ -1122,11 +1180,10 @@ static void tapdisk_control_handle_reque
         td_control.busy = 1;
     }
     conn->in.busy = 1;
-    conn->info = info;
 
        memset(&response, 0, sizeof(response));
 
-    err = info->handler(conn, &message, &response);
+    err = conn->info->handler(conn, &message, &response);
     if (err) {
         response.type = TAPDISK_MESSAGE_ERROR;
         response.u.response.error = -err;
@@ -1163,7 +1220,8 @@ static void tapdisk_control_handle_reque
     goto error;
 }
 
-static void tapdisk_control_accept(event_id_t id __attribute__((unused)),
+static void
+tapdisk_control_accept(event_id_t id __attribute__((unused)),
         char mode __attribute__((unused)),
         void *private __attribute__((unused)))
 {
diff --git a/tools/blktap3/drivers/tapdisk-diff.c 
b/tools/blktap3/drivers/tapdisk-diff.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/tapdisk-diff.c
@@ -0,0 +1,814 @@
+/*
+ * Copyright (c) 2009, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk-vbd.h"
+#include "tapdisk-server.h"
+#include "tapdisk-disktype.h"
+#include "libvhd.h"
+
+#define POLL_READ                        0
+#define POLL_WRITE                       1
+
+#define SPB_SHIFT (VHD_BLOCK_SHIFT - SECTOR_SHIFT)
+
+/* 
+ * we have to use half the max number of requests because we're using the same 
+ * tapdisk server for both streams and all the parents will be shared. If we 
+ * issue more than MAX_REQUESTS/2 requests, the vhd_state will run out of 
+ * vhd_request's and return EBUSY, which we don't handle here. However, even 
+ * with MAX_REQUESTS/2 we can still run out of vhd_request's because of 
+ * splitting: if some sectors spanned by a segment are in a parent, a segment 
+ * could be split into at most N/2 vhd_request's, where N is the number of 
+ * sectors per segment. Therefore, with MAX_SEGMENTS segments per request, we
+ * divide MAX_REQUESTS by a further MAX_SEGMENTS/2 on top of that (this note
+ * originally quoted 11 segments and 11/2=6; with MAX_SEGMENTS set to 8 below
+ * the divisor is 4). If we don't, we'd have to handle EBUSY by retrying here.
+ */
+#define MAX_SEGMENTS 8
+#define MAX_STREAM_REQUESTS (MAX_REQUESTS / 2 / (MAX_SEGMENTS / 2))
+
+struct tapdisk_stream_poll {
+       int                              pipe[2];   /* indexed by POLL_READ / POLL_WRITE; -1 while closed */
+       int                              set;       /* 1 after a wake-up was written, 0 once it has been cleared */
+};
+
+struct tapdisk_stream_request {
+       uint64_t                         sec;        /* first sector covered by the request */
+       uint32_t                         secs;       /* number of sectors covered */
+       uint64_t                         seqno;      /* key used to order completions and pair them across streams */
+       blkif_request_t                  blkif_req;  /* request that gets copied into the vbd request slot */
+       struct list_head                 next;       /* membership in a stream's free/pending/completed list */
+};
+
+struct tapdisk_stream {
+       td_vbd_t                        *vbd;                /* looked up via tapdisk_server_get_vbd(id) */
+
+       unsigned int                     id;                 /* taken from tapdisk_stream_count */
+
+       int                              err;                /* non-zero stops further enqueueing */
+
+       uint64_t                         cur;                /* next sector to issue */
+       uint64_t                         start;              /* always 0 in this tool */
+       uint64_t                         end;                /* td_disk_info_t.size of the image */
+
+       uint64_t                         started;            /* requests issued so far */
+       uint64_t                         completed;          /* requests compared so far */
+
+       struct tapdisk_stream_poll       poll;               /* wake-up pipe for the enqueue event */
+       event_id_t                       enqueue_event_id;   /* id from tapdisk_server_register_event */
+
+       struct list_head                 free_list;          /* unused entries of requests[] */
+       struct list_head                 pending_list;       /* issued, response not yet seen */
+       struct list_head                 completed_list;     /* answered OK, kept sorted by seqno */
+
+       struct tapdisk_stream_request    requests[MAX_STREAM_REQUESTS];   /* storage behind the three lists */
+};
+
+static unsigned int tapdisk_stream_count;        /* hands out stream ids in tapdisk_stream_open_image() */
+
+static void tapdisk_stream_close_image(struct tapdisk_stream *);
+
+static char *program;                            /* basename of argv[0], printed by usage() */
+static struct tapdisk_stream stream1, stream2;   /* the -n image and the -m image respectively */
+static vhd_context_t vhd1;                       /* first image opened directly so its BAT can be consulted */
+
+/*
+ * Print the command-line synopsis to @stream.
+ *
+ * main() passes stdout for -h and stderr for usage errors, so the text
+ * must go to the stream we were given rather than unconditionally to
+ * stdout.
+ */
+static void
+usage(FILE *stream)
+{
+       fprintf(stream,
+               "usage: %s <-n type:/path/to/image> <-m type:/path/to/image>\n",
+               program);
+}
+
+/*
+ * Open @path read-only into @vhd and load its block allocation table.
+ *
+ * Returns 0 on success, otherwise the error reported by libvhd (the
+ * context is closed again if only the BAT read failed).
+ */
+static int
+open_vhd(const char *path, vhd_context_t *vhd)
+{
+       int rc = vhd_open(vhd, path, VHD_OPEN_RDONLY);
+
+       if (rc != 0) {
+               printf("error opening %s: %d\n", path, rc);
+               return rc;
+       }
+
+       rc = vhd_get_bat(vhd);
+       if (rc == 0)
+               return 0;
+
+       printf("error reading BAT for %s: %d\n", path, rc);
+       vhd_close(vhd);
+       return rc;
+}
+
+/* Put @p into the "closed, nothing signalled" state. */
+static inline void
+tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p)
+{
+       p->pipe[POLL_WRITE] = -1;
+       p->pipe[POLL_READ]  = -1;
+       p->set              = 0;
+}
+
+/*
+ * Create the wake-up pipe for @p and switch both ends to non-blocking.
+ *
+ * Returns 0 on success or -errno on failure.  On failure both
+ * descriptors are closed and @p is reset to the closed state.
+ */
+static int
+tapdisk_stream_poll_open(struct tapdisk_stream_poll *p)
+{
+       int err;
+
+       tapdisk_stream_poll_initialize(p);
+
+       if (pipe(p->pipe))
+               return -errno;
+
+       if (fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK) ||
+           fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK)) {
+               /*
+                * Remember why fcntl() failed before calling close(),
+                * which is allowed to overwrite errno.
+                */
+               err = errno;
+               close(p->pipe[POLL_READ]);
+               close(p->pipe[POLL_WRITE]);
+               tapdisk_stream_poll_initialize(p);
+               return -err;
+       }
+
+       return 0;
+}
+
+/* Close whichever pipe ends are open and reset @p to the closed state. */
+static void
+tapdisk_stream_poll_close(struct tapdisk_stream_poll *p)
+{
+       static const int ends[] = { POLL_READ, POLL_WRITE };
+       unsigned int i;
+
+       for (i = 0; i < sizeof(ends) / sizeof(ends[0]); i++) {
+               if (p->pipe[ends[i]] != -1)
+                       close(p->pipe[ends[i]]);
+       }
+
+       tapdisk_stream_poll_initialize(p);
+}
+
+/*
+ * Consume one wake-up token from the pipe and mark @p as not signalled.
+ * The result of read() is deliberately ignored.
+ */
+static inline void
+tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p)
+{
+       int token;
+       int unused;
+
+       unused = read(p->pipe[POLL_READ], &token, sizeof(token));
+       (void)unused;
+
+       p->set = 0;
+}
+
+/*
+ * Signal @p by writing one token into the pipe, unless it is already
+ * signalled.  The result of write() is deliberately ignored.
+ */
+static inline void
+tapdisk_stream_poll_set(struct tapdisk_stream_poll *p)
+{
+       int token = 0;
+       int unused;
+
+       if (p->set)
+               return;
+
+       unused = write(p->pipe[POLL_WRITE], &token, sizeof(token));
+       (void)unused;
+
+       p->set = 1;
+}
+
+/*
+ * Returns 1 when @s has nothing left to do: it reached its end (or hit
+ * an error) and has neither pending nor completed-but-uncompared
+ * requests.  Returns 0 otherwise.
+ */
+static inline int
+tapdisk_stream_stop(struct tapdisk_stream *s)
+{
+       if (s->cur != s->end && !s->err)
+               return 0;
+
+       if (!list_empty(&s->pending_list))
+               return 0;
+
+       return list_empty(&s->completed_list) ? 1 : 0;
+}
+
+/* Zero @req and leave its list node self-linked (on no list). */
+static inline void
+tapdisk_stream_initialize_request(struct tapdisk_stream_request *req)
+{
+       memset(req, 0, sizeof(struct tapdisk_stream_request));
+       INIT_LIST_HEAD(&req->next);
+}
+
+/* Index of @req within @s->requests[]. */
+static inline int
+tapdisk_stream_request_idx(struct tapdisk_stream *s,
+                          struct tapdisk_stream_request *req)
+{
+       return (int)(req - &s->requests[0]);
+}
+
+/*
+ * Take the first entry off @s's free list and re-initialize it.
+ * Returns NULL when the free list is empty.
+ */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_get_request(struct tapdisk_stream *s)
+{
+       struct tapdisk_stream_request *req = NULL;
+
+       if (!list_empty(&s->free_list)) {
+               req = list_entry(s->free_list.next,
+                                struct tapdisk_stream_request, next);
+               list_del_init(&req->next);
+               tapdisk_stream_initialize_request(req);
+       }
+
+       return req;
+}
+
+/*
+ * Insert @sreq into @s's completed list, keeping the list ordered by
+ * ascending seqno: it goes in front of the first entry with a larger
+ * seqno, or at the tail if there is none.
+ */
+static inline void
+tapdisk_stream_queue_completed(struct tapdisk_stream *s,
+                              struct tapdisk_stream_request *sreq)
+{
+       struct list_head *before = &s->completed_list;
+       struct tapdisk_stream_request *pos;
+
+       list_for_each_entry(pos, &s->completed_list, next) {
+               if (sreq->seqno < pos->seqno) {
+                       before = &pos->next;
+                       break;
+               }
+       }
+
+       list_add_tail(&sreq->next, before);
+}
+
+/*
+ * Compare the data buffers belonging to a stream1 request and the
+ * stream2 request with the same seqno.
+ *
+ * Returns the memcmp() result over sreq1->secs sectors (0 when equal).
+ */
+static int
+tapdisk_result_compare(struct tapdisk_stream_request *sreq1,
+               struct tapdisk_stream_request  *sreq2)
+{
+       const char *lhs, *rhs;
+       unsigned long slot1, slot2;
+
+       assert(sreq1->seqno == sreq2->seqno);
+       assert(sreq1->secs == sreq2->secs);
+
+       slot1 = (unsigned long)tapdisk_stream_request_idx(&stream1, sreq1);
+       slot2 = (unsigned long)tapdisk_stream_request_idx(&stream2, sreq2);
+
+       lhs = (char *)MMAP_VADDR(stream1.vbd->ring.vstart, slot1, 0);
+       rhs = (char *)MMAP_VADDR(stream2.vbd->ring.vstart, slot2, 0);
+
+       return memcmp(lhs, rhs, sreq1->secs << SECTOR_SHIFT);
+}
+
+/* Entry that follows @req on the list it is currently linked into. */
+static inline struct tapdisk_stream_request *
+tapdisk_stream_request_next(struct tapdisk_stream_request *req)
+{
+       return list_entry(req->next.next, struct tapdisk_stream_request, next);
+}
+
+/*
+ * Walk the completed lists of both streams in step, comparing the data
+ * of entries whose seqno matches and recycling each compared pair onto
+ * the free lists.
+ *
+ * Returns 0 if every compared pair matched, otherwise the first
+ * non-zero tapdisk_result_compare() result (the walk stops there).
+ */
+static int
+tapdisk_stream_process_data(void)
+{
+       struct tapdisk_stream_request *cur1, *cur2, *next1, *next2;
+       int result = 0;
+
+       cur1  = list_entry(stream1.completed_list.next,
+                       struct tapdisk_stream_request, next);
+       cur2  = list_entry(stream2.completed_list.next,
+                       struct tapdisk_stream_request, next);
+       /* successors are fetched up front because cur1/cur2 may be unlinked */
+       next1 = tapdisk_stream_request_next(cur1);
+       next2 = tapdisk_stream_request_next(cur2);
+
+       while (result == 0 &&
+                       &cur1->next != &stream1.completed_list &&
+                       &cur2->next != &stream2.completed_list) {
+               if (cur1->seqno < cur2->seqno) {
+                       /* no partner on stream2's list yet: step stream1 only */
+                       cur1  = next1;
+                       next1 = tapdisk_stream_request_next(next1);
+                       continue;
+               }
+
+               if (cur1->seqno > cur2->seqno) {
+                       /* no partner on stream1's list yet: step stream2 only */
+                       cur2  = next2;
+                       next2 = tapdisk_stream_request_next(next2);
+                       continue;
+               }
+
+               result = tapdisk_result_compare(cur1, cur2);
+
+               stream1.completed++;
+               stream2.completed++;
+
+               list_del_init(&cur1->next);
+               list_add_tail(&cur1->next, &stream1.free_list);
+               list_del_init(&cur2->next);
+               list_add_tail(&cur2->next, &stream2.free_list);
+
+               cur1  = next1;
+               next1 = tapdisk_stream_request_next(next1);
+               cur2  = next2;
+               next2 = tapdisk_stream_request_next(next2);
+       }
+
+       return result;
+}
+
+/*
+ * vbd completion callback (installed with tapdisk_vbd_set_callback).
+ *
+ * @arg is the stream the response belongs to; @rsp->id is the index of
+ * the originating entry in that stream's requests[].  Successful reads
+ * are queued for comparison, failed ones flag the stream with EIO and
+ * are recycled.  Afterwards any comparable pairs are processed and both
+ * streams are woken so they can enqueue more work or shut down.
+ */
+static void
+tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp)
+{
+       struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+       struct tapdisk_stream_request *sreq = s->requests + rsp->id;
+
+       list_del_init(&sreq->next);
+
+       if (rsp->status == BLKIF_RSP_OKAY)
+               tapdisk_stream_queue_completed(s, sreq);
+       else {
+               s->err = EIO;
+               list_add_tail(&sreq->next, &s->free_list);
+               /* sec is uint64_t: PRIu64 is the matching conversion */
+               fprintf(stderr, "error reading sector %"PRIu64" (stream %d)\n",
+                               sreq->sec, (s == &stream2) + 1);
+       }
+
+       if (tapdisk_stream_process_data()) {
+               fprintf(stderr, "mismatch at sector %"PRIu64"\n",
+                               sreq->sec);
+               stream1.err = EINVAL;
+               stream2.err = EINVAL;
+       }
+
+       tapdisk_stream_poll_set(&stream1.poll);
+       tapdisk_stream_poll_set(&stream2.poll);
+}
+
+/*
+ * Queue on stream @s a read that mirrors request @r (same sector,
+ * length, seqno and segment layout).
+ *
+ * Returns 0 when the request was queued on @s's vbd, 1 when @s has no
+ * free request slot left.  The caller is responsible for calling
+ * tapdisk_vbd_issue_requests() afterwards.
+ */
+static inline int
+tapdisk_stream_enqueue_copy(struct tapdisk_stream *s, 
+               struct tapdisk_stream_request *r)
+{
+       td_vbd_t *vbd;
+       blkif_request_t *breq;
+       td_vbd_request_t *vreq;
+       struct tapdisk_stream_request *sreq;
+       int i, idx;
+
+       /* use the vbd of the stream we were handed, not a fixed global */
+       vbd = s->vbd;
+       sreq = tapdisk_stream_get_request(s);
+       if (!sreq)
+               return 1;
+
+       idx                 = tapdisk_stream_request_idx(s, sreq);
+
+       sreq->sec           = r->sec;
+       sreq->secs          = r->secs;
+       sreq->seqno         = r->seqno;
+
+       breq                = &sreq->blkif_req;
+       breq->id            = idx;
+       breq->nr_segments   = r->blkif_req.nr_segments;
+       breq->sector_number = r->blkif_req.sector_number;
+       breq->operation     = BLKIF_OP_READ;
+
+       for (i = 0; i < r->blkif_req.nr_segments; i++) {
+               struct blkif_request_segment *seg = breq->seg + i;
+               seg->first_sect = r->blkif_req.seg[i].first_sect;
+               seg->last_sect  = r->blkif_req.seg[i].last_sect;
+       }
+       s->cur += sreq->secs;
+
+       /* the vbd request slot shares its index with our request slot */
+       vreq = vbd->request_list + idx;
+       assert(list_empty(&vreq->next));
+       assert(vreq->secs_pending == 0);
+
+       memcpy(&vreq->req, breq, sizeof(*breq));
+       s->started++;
+       vbd->received++;
+       vreq->vbd = vbd;
+
+       tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+       list_add_tail(&sreq->next, &s->pending_list);
+
+       return 0;
+}
+
+/*
+ * Issue as many reads on stream1 as there are free request slots.
+ *
+ * Blocks that the first image's BAT marks as unused are skipped.  Each
+ * request carries up to MAX_SEGMENTS page-sized segments and never
+ * crosses a VHD block boundary.
+ */
+static void
+tapdisk_stream_enqueue1(void)
+{
+       td_vbd_t *vbd;
+       int i, idx, psize;
+       /*
+        * 64-bit so that blk << SPB_SHIFT is evaluated in the same width
+        * as the sector counters instead of overflowing int on large
+        * images.
+        */
+       uint64_t blk;
+       struct tapdisk_stream *s = &stream1;
+
+       vbd = s->vbd;
+       psize = getpagesize();
+
+       while (s->cur < s->end && !s->err) {
+               blkif_request_t *breq;
+               td_vbd_request_t *vreq;
+               struct tapdisk_stream_request *sreq;
+
+               /* skip any blocks that are not present in this image */
+               blk = s->cur >> SPB_SHIFT;
+               while (s->cur < s->end && vhd1.bat.bat[blk] == DD_BLK_UNUSED) {
+                       blk++;
+                       s->cur = blk << SPB_SHIFT;
+               }
+
+               if (s->cur >= s->end) {
+                       /*
+                        * Skipping a trailing unused block can step past
+                        * end when end is not block-aligned; clamp so
+                        * tapdisk_stream_stop() (cur == end) still
+                        * recognizes completion.
+                        */
+                       s->cur = s->end;
+                       break;
+               }
+
+               sreq = tapdisk_stream_get_request(s);
+               if (!sreq)
+                       break;
+
+               idx                 = tapdisk_stream_request_idx(s, sreq);
+
+               sreq->sec           = s->cur;
+               sreq->secs          = 0;
+               sreq->seqno         = s->started++;
+
+               breq                = &sreq->blkif_req;
+               breq->id            = idx;
+               breq->nr_segments   = 0;
+               breq->sector_number = sreq->sec;
+               breq->operation     = BLKIF_OP_READ;
+
+               for (i = 0; i < MAX_SEGMENTS; i++) {
+                       uint32_t secs;
+                       struct blkif_request_segment *seg = breq->seg + i;
+
+                       /* at most one page, and never past end ... */
+                       secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT);
+                       /* ... nor past the end of the current block */
+                       secs = MIN(((blk + 1) << SPB_SHIFT) - s->cur, secs);
+                       if (!secs)
+                               break;
+
+                       sreq->secs += secs;
+                       s->cur     += secs;
+
+                       seg->first_sect = 0;
+                       seg->last_sect  = secs - 1;
+                       breq->nr_segments++;
+               }
+
+               /* the vbd request slot shares its index with our slot */
+               vreq = vbd->request_list + idx;
+
+               assert(list_empty(&vreq->next));
+               assert(vreq->secs_pending == 0);
+
+               memcpy(&vreq->req, breq, sizeof(*breq));
+               vbd->received++;
+               vreq->vbd = vbd;
+
+               tapdisk_vbd_move_request(vreq, &vbd->new_requests);
+               list_add_tail(&sreq->next, &s->pending_list);
+       }
+
+       tapdisk_vbd_issue_requests(vbd);
+}
+
+/*
+ * Mirror onto stream2 the reads that stream1 currently has on its
+ * completed and pending lists, skipping entries whose start sector lies
+ * below stream2's cursor.  If every candidate could be queued, stream2's
+ * cursor is set to stream1's; if stream2 runs out of request slots the
+ * cursor is left where the copies put it.
+ */
+static void
+tapdisk_stream_enqueue2(void)
+{
+       td_vbd_t *vbd;
+       struct tapdisk_stream_request *itr;
+       struct tapdisk_stream *s = &stream2;
+
+       vbd = s->vbd;
+
+       /* issue the same requests that we issued on stream1 */
+       list_for_each_entry(itr, &stream1.completed_list, next) {
+               if (itr->sec < s->cur)
+                       continue;
+               if (tapdisk_stream_enqueue_copy(s, itr))
+                       goto done;
+       }
+
+       list_for_each_entry(itr, &stream1.pending_list, next) {
+               if (itr->sec < s->cur)
+                       continue;
+               if (tapdisk_stream_enqueue_copy(s, itr))
+                       goto done;
+       }
+
+       stream2.cur = stream1.cur;
+
+done:
+       tapdisk_vbd_issue_requests(vbd);
+}
+
+/* Returns 1 once both streams report tapdisk_stream_stop(), else 0. */
+static inline int
+tapdisk_diff_done(void)
+{
+       if (!tapdisk_stream_stop(&stream1))
+               return 0;
+
+       return tapdisk_stream_stop(&stream2) ? 1 : 0;
+}
+
+/* Close the images of both streams, stream1 first. */
+static void
+tapdisk_diff_stop(void)
+{
+       struct tapdisk_stream *const both[] = { &stream1, &stream2 };
+       unsigned int i;
+
+       for (i = 0; i < sizeof(both) / sizeof(both[0]); i++)
+               tapdisk_stream_close_image(both[i]);
+}
+
+/*
+ * Enqueue-event handler for the stream passed in @arg (registered on
+ * the stream's wake-up pipe, and also called directly).
+ *
+ * Clears the wake-up, then either enqueues more work for that stream or,
+ * when both streams are finished, closes both images.  @id and @mode are
+ * not used.
+ */
+static void
+tapdisk_stream_enqueue(event_id_t id, char mode, void *arg)
+{
+       struct tapdisk_stream *s = (struct tapdisk_stream *)arg;
+
+       tapdisk_stream_poll_clear(&s->poll);
+
+       if (!tapdisk_diff_done()) {
+               if (s == &stream1) 
+                       tapdisk_stream_enqueue1();
+               else if (s == &stream2)
+                       tapdisk_stream_enqueue2();
+               else
+                       assert(0);
+
+               /*
+                * check again: enqueueing may have finished the job,
+                * e.g. when stream1 had no blocks at all
+                */
+               if (!tapdisk_diff_done())
+                       return;
+       }
+
+       tapdisk_diff_stop();
+}
+
+/*
+ * Create a vbd for @s, open image @name on it read-only and record the
+ * image size as the stream's end.
+ *
+ * Returns 0 on success or the error from the failing step (ENODEV when
+ * the freshly initialized vbd cannot be looked up).  A message is
+ * printed to stderr on every failure path.
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *name)
+{
+       td_disk_info_t info;
+       int err;
+
+       s->id = tapdisk_stream_count++;
+
+       err = tapdisk_vbd_initialize(-1, -1, s->id);
+       if (!err) {
+               s->vbd = tapdisk_server_get_vbd(s->id);
+               if (s->vbd) {
+                       tapdisk_vbd_set_callback(s->vbd,
+                                       tapdisk_stream_dequeue, s);
+                       err = tapdisk_vbd_open_vdi(s->vbd, name,
+                                       TD_OPEN_RDONLY, -1);
+               } else {
+                       err = ENODEV;
+               }
+       }
+
+       if (err) {
+               fprintf(stderr, "failed to open image %s: %d\n", name, err);
+               return err;
+       }
+
+       err = tapdisk_vbd_get_disk_info(s->vbd, &info);
+       if (err) {
+               fprintf(stderr, "failed getting image size: %d\n", err);
+               return err;
+       }
+
+       s->start = 0;
+       s->cur   = s->start;
+       s->end   = info.size;
+
+       return 0;
+}
+
+/*
+ * Tear down the vbd registered under @s->id, if there still is one:
+ * close its VDI, remove it from the server and free its data buffer,
+ * name and the vbd itself.  Does nothing when no such vbd exists.
+ */
+static void
+tapdisk_stream_close_image(struct tapdisk_stream *s)
+{
+       td_vbd_t *vbd = tapdisk_server_get_vbd(s->id);
+
+       if (!vbd)
+               return;
+
+       tapdisk_vbd_close_vdi(vbd);
+       tapdisk_server_remove_vbd(vbd);
+
+       free((void *)vbd->ring.vstart);
+       free(vbd->name);
+       free(vbd);
+
+       s->vbd = NULL;
+}
+
+/*
+ * Allocate the page-aligned data area for @s's vbd and put every entry
+ * of @s->requests[] on the free list.
+ *
+ * Returns 0 on success or the posix_memalign() error code.
+ */
+static int
+tapdisk_stream_initialize_requests(struct tapdisk_stream *s)
+{
+       td_ring_t *ring = &s->vbd->ring;
+       const int psize = getpagesize();
+       const size_t size = psize * BLKTAP_MMAP_REGION_SIZE;
+       int err, i;
+
+       /* point ring->vstart at our own buffer so tapdisk_vbd uses it */
+       err = posix_memalign((void **)&ring->vstart, psize, size);
+       if (err) {
+               fprintf(stderr, "failed to allocate buffers: %d\n", err);
+               ring->vstart = 0;
+               return err;
+       }
+
+       for (i = 0; i < MAX_STREAM_REQUESTS; i++) {
+               tapdisk_stream_initialize_request(&s->requests[i]);
+               list_add_tail(&s->requests[i].next, &s->free_list);
+       }
+
+       return 0;
+}
+
+/*
+ * Open @s's wake-up pipe and register tapdisk_stream_enqueue() as the
+ * handler for its read end.
+ *
+ * Returns 0 on success, non-zero on failure (a message is printed to
+ * stderr).  On failure no pipe is left open.
+ */
+static int
+tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s)
+{
+       int err;
+       struct tapdisk_stream_poll *p = &s->poll;
+
+       err = tapdisk_stream_poll_open(p);
+       if (err)
+               goto out;
+
+       err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                           p->pipe[POLL_READ], 0,
+                                           tapdisk_stream_enqueue, s);
+       if (err < 0) {
+               /* nobody will unregister this stream: don't leak the pipe */
+               tapdisk_stream_poll_close(p);
+               goto out;
+       }
+
+       /* a non-negative return value is the event id */
+       s->enqueue_event_id = err;
+       err = 0;
+
+out:
+       if (err)
+               fprintf(stderr, "failed to register event: %d\n", err);
+       return err;
+}
+
+/* Unregister @s's enqueue event (if one is recorded) and close its pipe. */
+static void
+tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s)
+{
+       const event_id_t id = s->enqueue_event_id;
+
+       if (id) {
+               tapdisk_server_unregister_event(id);
+               s->enqueue_event_id = 0;
+       }
+
+       tapdisk_stream_poll_close(&s->poll);
+}
+
+/*
+ * Reset @s to an empty stream: all fields zeroed, the three request
+ * lists empty and the wake-up pipe marked closed.
+ */
+static inline void
+tapdisk_stream_initialize(struct tapdisk_stream *s)
+{
+       memset(s, 0, sizeof(*s));
+       INIT_LIST_HEAD(&s->free_list);
+       INIT_LIST_HEAD(&s->pending_list);
+       INIT_LIST_HEAD(&s->completed_list);
+       /*
+        * memset() leaves pipe[] = {0, 0}, which tapdisk_stream_poll_close()
+        * would take for open descriptors; mark both ends closed (-1).
+        */
+       tapdisk_stream_poll_initialize(&s->poll);
+}
+
+/*
+ * Bring stream @s up on image @arg: reset it, open the image, set up
+ * its request slots, register the enqueue event and run the enqueue
+ * handler once.
+ *
+ * Returns 0 on success or the error of the first step that failed.
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *arg)
+{
+       int err;
+
+       tapdisk_stream_initialize(s);
+
+       err = tapdisk_stream_open_image(s, arg);
+       if (!err)
+               err = tapdisk_stream_initialize_requests(s);
+       if (!err)
+               err = tapdisk_stream_register_enqueue_event(s);
+       if (err)
+               return err;
+
+       tapdisk_stream_enqueue(s->enqueue_event_id, 
+                              SCHEDULER_POLL_READ_FD, s);
+
+       return 0;
+}
+
+/* Undo tapdisk_stream_open(): close the image, then drop the event. */
+static void
+tapdisk_stream_release(struct tapdisk_stream *s)
+{
+       tapdisk_stream_close_image(s);
+
+       tapdisk_stream_unregister_enqueue_event(s);
+}
+
+/*
+ * Run the enqueue handler once for @s, run the tapdisk server loop, and
+ * return the stream's error code afterwards.
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+       tapdisk_stream_enqueue(s->enqueue_event_id,
+                              SCHEDULER_POLL_READ_FD,
+                              s);
+
+       tapdisk_server_run();
+
+       return s->err;
+}
+
+/*
+ * tapdisk-diff: compare two images sector by sector.
+ *
+ *   -n type:/path   first image (must be a VHD; only the blocks
+ *                   allocated in it are read and compared)
+ *   -m type:/path   second image
+ *   -h              print usage and exit
+ *
+ * The exit status is 0 when the run completed without error, 1 for a
+ * usage error, and otherwise the error code of the failing step or of
+ * whichever stream recorded an error (read failure or data mismatch).
+ */
+int
+main(int argc, char *argv[])
+{
+       int c, err, type1;
+       const char *arg1 = NULL, *arg2 = NULL;
+       const char *path1;
+
+       err    = 0;
+
+       program = basename(argv[0]);
+       
+       while ((c = getopt(argc, argv, "n:m:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       arg1 = optarg;
+                       break;
+               case 'm':
+                       arg2 = optarg;
+                       break;
+               case 'h':
+                       usage(stdout);
+                       return 0;
+               default:
+                       goto fail_usage;
+               }
+       }
+
+       if (!arg1 || !arg2)
+               goto fail_usage;
+
+       type1 = tapdisk_disktype_parse_params(arg1, &path1);
+       if (type1 < 0)
+               return type1;
+
+       if (type1 != DISK_TYPE_VHD) {
+               printf("error: first VDI is not VHD\n");
+               return EINVAL;
+       }
+
+       /* the first image's BAT tells stream1 which blocks to skip */
+       err = open_vhd(path1, &vhd1);
+       if (err)
+               return err;
+
+       tapdisk_start_logging("tapdisk-diff", "daemon");
+
+       err = tapdisk_server_initialize(NULL, NULL);
+       if (err)
+               goto out;
+
+       err = tapdisk_stream_open(&stream1, arg1);
+       if (err) {
+               fprintf(stderr, "Failed to open %s: %s\n", 
+                       arg1, strerror(-err));
+               goto out;
+       }
+
+       err = tapdisk_stream_open(&stream2, arg2);
+       if (err) {
+               fprintf(stderr, "Failed to open %s: %s\n", 
+                       arg2, strerror(-err));
+               goto out1;
+       }
+
+       if (stream1.end != stream2.end) {
+               fprintf(stderr, "Image sizes differ: %"PRIu64" != %"PRIu64"\n",
+                               stream1.end, stream2.end);
+               err = EINVAL;
+               goto out2;
+       }
+
+       tapdisk_server_run();
+       
+out2:
+       tapdisk_stream_release(&stream2);
+out1:
+       tapdisk_stream_release(&stream1);
+out:
+       vhd_close(&vhd1);
+       tapdisk_stop_logging();
+
+       if (err)
+               return err;
+
+       /*
+        * A read error is recorded only on the stream it occurred on, so
+        * stream2 has to be consulted as well or a failure there would
+        * be reported as success.
+        */
+       return stream1.err ? stream1.err : stream2.err;
+
+fail_usage:
+       usage(stderr);
+       return 1;
+}
diff --git a/tools/blktap3/drivers/tapdisk-disktype.c 
b/tools/blktap3/drivers/tapdisk-disktype.c
--- a/tools/blktap3/drivers/tapdisk-disktype.c
+++ b/tools/blktap3/drivers/tapdisk-disktype.c
@@ -126,6 +126,12 @@ static const disk_info_t valve_disk = {
     DISK_TYPE_FILTER,
 };
 
+static const disk_info_t nbd_disk = {   /* registered under DISK_TYPE_NBD in tapdisk_disk_types[] */
+       "nbd",                           /* presumably the type prefix of "type:/path" arguments -- confirm against disk_info_t */
+       "export to a NBD server",
+       0,                               /* NOTE(review): looks like a flags field with nothing set -- confirm */
+};
+
 const disk_info_t *tapdisk_disk_types[] = {
        [DISK_TYPE_AIO] = &aio_disk,
        [DISK_TYPE_SYNC]        = &sync_disk,
@@ -142,6 +148,7 @@ const disk_info_t *tapdisk_disk_types[] 
     [DISK_TYPE_VALVE] = &valve_disk,
     [DISK_TYPE_LLPCACHE] = &llpcache_disk,
     [DISK_TYPE_LLECACHE] = &llecache_disk,
+       [DISK_TYPE_NBD]         = &nbd_disk,
        0,
 };
 
@@ -155,7 +162,6 @@ extern struct tap_disk tapdisk_sync;
 extern struct tap_disk tapdisk_vmdk;
 extern struct tap_disk tapdisk_vhdsync;
 #endif
-
 extern struct tap_disk tapdisk_vhd;
 extern struct tap_disk tapdisk_ram;
 
@@ -165,7 +171,7 @@ extern struct tap_disk tapdisk_ram;
 #if 0
 extern struct tap_disk tapdisk_qcow;
 #endif
-
+extern struct tap_disk tapdisk_block_cache;
 extern struct tap_disk tapdisk_vhd_index;
 
 /*
@@ -174,6 +180,11 @@ extern struct tap_disk tapdisk_vhd_index
 #if 0
 extern struct tap_disk tapdisk_log;
 #endif
+extern struct tap_disk tapdisk_lcache;
+extern struct tap_disk tapdisk_llpcache;
+extern struct tap_disk tapdisk_llecache;
+extern struct tap_disk tapdisk_valve;
+extern struct tap_disk tapdisk_nbd;
 
 const struct tap_disk *
 tapdisk_disk_driver_get(const enum disk_type dt)
@@ -190,13 +201,7 @@ tapdisk_disk_driver_get(const enum disk_
         [DISK_TYPE_VHDSYNC] = &tapdisk_vhdsync_disk
 #endif
         [DISK_TYPE_VHD]         = &tapdisk_vhd,
-
-        /*
-         * TODO Commeneted out to simplify the upstreaming process.
-         */
-#if 0
         [DISK_TYPE_RAM]         = &tapdisk_ram,
-#endif
 
         /*
          * XXX Commented out in blktap2.5.
@@ -205,13 +210,8 @@ tapdisk_disk_driver_get(const enum disk_
         [DISK_TYPE_QCOW]        = &tapdisk_qcow,
 #endif
 
-        /*
-         * TODO Commeneted out to simplify the upstreaming process.
-         */
-#if 0
         [DISK_TYPE_BLOCK_CACHE] = &tapdisk_block_cache,
            [DISK_TYPE_VINDEX]      = &tapdisk_vhd_index,
-#endif
 
         /*
          * XXX Commented out in blktap2.5.
@@ -220,16 +220,12 @@ tapdisk_disk_driver_get(const enum disk_
         [DISK_TYPE_LOG]         = &tapdisk_log,
 #endif
 
-        /*
-         * TODO Commeneted out to simplify the upstreaming process.
-         */
-#if 0
         [DISK_TYPE_LCACHE]      = &tapdisk_lcache,
         [DISK_TYPE_LLPCACHE]    = &tapdisk_llpcache,
         [DISK_TYPE_LLECACHE]    = &tapdisk_llecache,
         [DISK_TYPE_VALVE]       = &tapdisk_valve,
         [DISK_TYPE_NBD]         = &tapdisk_nbd,
-#endif
+               0
     };
 
     if (dt < 0 || dt > ARRAY_SIZE(tapdisk_disk_drivers))
diff --git a/tools/blktap3/drivers/tapdisk-disktype.h 
b/tools/blktap3/drivers/tapdisk-disktype.h
--- a/tools/blktap3/drivers/tapdisk-disktype.h
+++ b/tools/blktap3/drivers/tapdisk-disktype.h
@@ -44,7 +44,8 @@ enum disk_type {
     DISK_TYPE_LCACHE,
     DISK_TYPE_LLECACHE,
     DISK_TYPE_LLPCACHE,
-    DISK_TYPE_VALVE};
+    DISK_TYPE_VALVE,
+    DISK_TYPE_NBD};
 
 #define DISK_TYPE_NAME_MAX    32
 
diff --git a/tools/blktap3/drivers/tapdisk-driver.c 
b/tools/blktap3/drivers/tapdisk-driver.c
--- a/tools/blktap3/drivers/tapdisk-driver.c
+++ b/tools/blktap3/drivers/tapdisk-driver.c
@@ -40,7 +40,7 @@ tapdisk_driver_log_flush(td_driver_t * d
     td_loglimit_t *rl = &driver->loglimit;
 
     if (rl->dropped) {
-        tlog_syslog(LOG_WARNING,
+        tlog_syslog(TLOG_WARN,
                     "%s: %s: %d messages suppressed",
                     driver->name, __caller, rl->dropped);
         rl->dropped = 0;
@@ -58,7 +58,7 @@ int tapdisk_driver_log_pass(td_driver_t 
     }
 
     if (!dropping)
-        tlog_syslog(LOG_WARNING,
+        tlog_syslog(TLOG_WARN,
                     "%s: %s: too many errors, dropped.",
                     driver->name, __caller);
 
diff --git a/tools/blktap3/drivers/tapdisk-fdreceiver.c 
b/tools/blktap3/drivers/tapdisk-fdreceiver.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/tapdisk-fdreceiver.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2012, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/wait.h>
+#include <sys/un.h>
+
+#include "tapdisk.h"
+#include "tapdisk-fdreceiver.h"
+#include "tapdisk-server.h"
+#include "scheduler.h"
+
+#define UNIX_BUFFER_SIZE 16384
+
+#define INFO(_f, _a...)            tlog_syslog(TLOG_INFO, "nbd: " _f, ##_a)
+#define ERROR(_f, _a...)           tlog_syslog(TLOG_WARN, "nbd: " _f, ##_a)
+
+/*
+ * Scheduler callback for the transient client connection. Receives a single
+ * message that is expected to carry a file descriptor as SCM_RIGHTS ancillary
+ * data, tears the connection down, and passes the descriptor together with
+ * the message payload to the registered callback.
+ *
+ * @id, @mode: unused.
+ * @data: the struct td_fdreceiver owning the connection.
+ *
+ * The callback is invoked with fd == -1 if no descriptor was received. If
+ * recvmsg() itself fails the connection is left untouched and the callback
+ * is not invoked.
+ */
+static void
+td_fdreceiver_recv_fd(event_id_t id __attribute__((unused)),
+        char mode __attribute__((unused)), void *data)
+{
+       struct td_fdreceiver *fdreceiver = data;
+       int ret, fd = -1;
+       char iobuf[UNIX_BUFFER_SIZE];
+       /* The union keeps the control buffer aligned for struct cmsghdr. */
+       union {
+               char buf[CMSG_SPACE(sizeof(int))];
+               struct cmsghdr align;
+       } control;
+       struct sockaddr_un unix_socket_name;
+       struct msghdr msg;
+       struct iovec vec;
+       struct cmsghdr *cmsg;
+
+       memset(iobuf, 0, sizeof(iobuf));
+       memset(&control, 0, sizeof(control));
+       memset(&msg, 0, sizeof(msg));
+
+       /*
+        * iobuf is later treated as a C string, so never let recvmsg() fill
+        * the last byte: it must stay NUL.
+        */
+       vec.iov_base = iobuf;
+       vec.iov_len = sizeof(iobuf) - 1;
+
+       msg.msg_name = &unix_socket_name;
+       msg.msg_namelen = sizeof(unix_socket_name);
+       msg.msg_iov = &vec;
+       msg.msg_iovlen = 1;
+       msg.msg_control = control.buf;
+       msg.msg_controllen = sizeof(control.buf);
+
+       ret = recvmsg(fdreceiver->client_fd, &msg, 0);
+       if (ret == -1) {
+               /* Report errno: ret itself is always -1 here. */
+               ERROR("Failed to receive the message: %d", errno);
+               return;
+       }
+
+       if (ret > 0 && msg.msg_controllen > 0) {
+               /* CMSG_FIRSTHDR() yields NULL if no complete header fits. */
+               cmsg = CMSG_FIRSTHDR(&msg);
+               if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
+                               cmsg->cmsg_type == SCM_RIGHTS &&
+                               cmsg->cmsg_len >= CMSG_LEN(sizeof(fd)))
+                       /* memcpy avoids an unaligned int access. */
+                       memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+               else
+                       ERROR("Failed to recieve a file descriptor");
+       }
+
+       INFO("Recieved fd with message: %s", iobuf);
+
+       /*
+        * We're done with this connection, it was only transiently used to
+        * connect the client
+        */
+       close(fdreceiver->client_fd);
+       fdreceiver->client_fd = -1;
+
+       tapdisk_server_unregister_event(fdreceiver->client_event_id);
+       fdreceiver->client_event_id = -1;
+
+       /*
+        * It is the responsibility of this callback function to arrange that
+        * the fd is eventually closed
+        */
+       fdreceiver->callback(fd, iobuf, fdreceiver->callback_data);
+}
+
+/*
+ * Scheduler callback for the listening unix domain socket. Accepts one
+ * pending connection and registers td_fdreceiver_recv_fd() on it.
+ *
+ * @id, @mode: unused.
+ * @data: the struct td_fdreceiver owning the listening socket.
+ *
+ * Only one client connection is handled at a time: if one is already
+ * active, the newly accepted connection is closed immediately.
+ */
+static void
+td_fdreceiver_accept_fd(event_id_t id __attribute__((unused)),
+        char mode __attribute__((unused)), void *data)
+{
+       struct sockaddr_storage their_addr;
+       socklen_t sin_size = sizeof(their_addr);
+       struct td_fdreceiver *fdreceiver = data;
+       int new_fd;
+
+       INFO("Unix domain socket is ready to accept");
+
+       new_fd = accept(fdreceiver->fd,
+                       (struct sockaddr *)&their_addr, &sin_size);
+       if (new_fd < 0) {
+               /* Never store or register an invalid descriptor. */
+               ERROR("td_fdreceiver_accept_fd: accept failed (errno=%d)",
+                               errno);
+               return;
+       }
+
+       if (fdreceiver->client_fd != -1) {
+               ERROR("td_fdreceiver_accept_fd: can only cope with one connec"
+                               "tion at once to the unix domain socket!");
+               close(new_fd);
+               return;
+       }
+
+       fdreceiver->client_fd = new_fd;
+
+       fdreceiver->client_event_id =
+               tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                               fdreceiver->client_fd, 0,
+                               td_fdreceiver_recv_fd,
+                               fdreceiver);
+
+       if (fdreceiver->client_event_id < 0) {
+               ERROR("td_fdreceiver_accept_fd: failed to register event "
+                               "(errno=%d)", errno);
+               close(new_fd);
+               fdreceiver->client_fd = -1;
+       }
+}
+
+/*
+ * Shuts an fd receiver down and releases it: closes the client and listening
+ * sockets, unregisters their events, removes the socket path from the
+ * filesystem and frees the structure. @fdreceiver must not be used
+ * afterwards.
+ */
+void
+td_fdreceiver_stop(struct td_fdreceiver *fdreceiver)
+{
+       char *sock_path = fdreceiver->path;
+
+       /* Transient client connection, if one is still around. */
+       if (!(fdreceiver->client_fd < 0))
+               close(fdreceiver->client_fd);
+       if (!(fdreceiver->client_event_id < 0))
+               tapdisk_server_unregister_event(fdreceiver->client_event_id);
+
+       /* Listening socket. */
+       if (!(fdreceiver->fd < 0))
+               close(fdreceiver->fd);
+       if (!(fdreceiver->fd_event_id < 0))
+               tapdisk_server_unregister_event(fdreceiver->fd_event_id);
+
+       /* free(NULL) is a no-op, so only unlink() needs the guard. */
+       if (sock_path != NULL)
+               unlink(sock_path);
+       free(sock_path);
+
+       free(fdreceiver);
+}
+
+/*
+ * Creates a unix domain socket listening on @path and registers an event
+ * that accepts connections on it. For every message received on such a
+ * connection, @callback is invoked with the received file descriptor, the
+ * message text and @data.
+ *
+ * NB: anything that already exists at @path is unlinked first.
+ *
+ * Returns the new receiver (to be released with td_fdreceiver_stop()), or
+ * NULL on failure.
+ */
+struct td_fdreceiver *
+td_fdreceiver_start(char *path, fd_cb_t callback, void *data)
+{
+       /* Must be signed: socket() reports failure with -1. */
+       int s = -1;
+       struct sockaddr_un local;
+       int len;
+       int err;
+       int n;
+       struct td_fdreceiver *fdreceiver;
+
+       fdreceiver = malloc(sizeof(struct td_fdreceiver));
+       if (!fdreceiver) {
+               ERROR("td_fdreceiver_start: error allocating memory for "
+                               "fdreceiver (path=%s)", path);
+               return NULL;
+       }
+
+       fdreceiver->path = strdup(path);
+       fdreceiver->fd = -1;
+       fdreceiver->fd_event_id = -1;
+       fdreceiver->client_fd = -1;
+       fdreceiver->client_event_id = -1;
+       fdreceiver->callback = callback;
+       fdreceiver->callback_data = data;
+
+       if (!fdreceiver->path) {
+               ERROR("td_fdreceiver_start: error allocating memory for "
+                               "fdreceiver (path=%s)", path);
+               goto error;
+       }
+
+       memset(&local, 0, sizeof(local));
+       local.sun_family = AF_UNIX;
+       n = snprintf(local.sun_path, sizeof(local.sun_path), "%s", path);
+       if (n < 0 || (size_t)n >= sizeof(local.sun_path))
+               /* Truncation is kept as before, but is no longer silent. */
+               ERROR("td_fdreceiver_start: path '%s' does not fit in "
+                               "sun_path, socket name truncated", path);
+
+       /*
+        * NB: here we unlink anything that was there before - be very careful
+        * with the paths you pass to this function!
+        */
+       unlink(local.sun_path);
+       len = strlen(local.sun_path) + sizeof(local.sun_family);
+
+       s = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (s < 0) {
+               ERROR("td_fdreceiver_start: error creating socket "
+                               "(path=%s)", path);
+               goto error;
+       }
+
+       err = bind(s, (struct sockaddr *)&local, len);
+       if (err < 0) {
+               ERROR("td_fdreceiver_start: error binding (path=%s)", path);
+               goto error;
+       }
+
+       err = listen(s, 5);
+       if (err < 0) {
+               ERROR("td_fdreceiver_start: error listening (path=%s)", path);
+               goto error;
+       }
+
+       fdreceiver->fd = s;
+
+       fdreceiver->fd_event_id =
+               tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                               fdreceiver->fd, 0,
+                               td_fdreceiver_accept_fd,
+                               fdreceiver);
+
+       if (fdreceiver->fd_event_id < 0) {
+               ERROR("td_fdreceiver_start: error registering event "
+                               "(path=%s)", path);
+               goto error;
+       }
+
+       INFO("Set up local unix domain socket on path '%s'", path);
+
+       return fdreceiver;
+
+error:
+       if (s >= 0)
+               close(s);
+
+       /* fdreceiver is always valid here; its path may still be NULL. */
+       free(fdreceiver->path);
+       free(fdreceiver);
+
+       return NULL;
+}
diff --git a/tools/blktap3/drivers/tapdisk-fdreceiver.h 
b/tools/blktap3/drivers/tapdisk-fdreceiver.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/tapdisk-fdreceiver.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2012, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Unix domain socket fd receiver */
+
+#ifndef TAPDISK_FDRECEIVER_H
+#define TAPDISK_FDRECEIVER_H
+
+/*
+ * Callback invoked once per received message.
+ *
+ * @fd: the received file descriptor, or -1 if the message carried none. The
+ *      callback is responsible for eventually closing it.
+ * @msg: the NUL-terminated message payload.
+ * @data: the opaque pointer given to td_fdreceiver_start().
+ */
+typedef void (*fd_cb_t) (int fd, char *msg, void *data);
+
+struct td_fdreceiver {
+       /* Filesystem path of the listening socket (heap-allocated copy). */
+       char *path;
+
+       /* Listening socket and its scheduler event, -1 when not set. */
+       int fd;
+       int fd_event_id;
+
+       /* Currently connected client and its event, -1 when there is none. */
+       int client_fd;
+       int client_event_id;
+
+       /* User callback and the opaque pointer handed back to it. */
+       fd_cb_t callback;
+       void *callback_data;
+};
+
+/*
+ * Starts listening on @path; returns NULL on failure. Anything that already
+ * exists at @path is unlinked.
+ */
+struct td_fdreceiver *td_fdreceiver_start(char *path, fd_cb_t, void *data);
+
+/* Stops the receiver, removes its socket path and frees it. */
+void td_fdreceiver_stop(struct td_fdreceiver *);
+
+#endif /* TAPDISK_FDRECEIVER_H */
diff --git a/tools/blktap3/drivers/tapdisk-image.c 
b/tools/blktap3/drivers/tapdisk-image.c
--- a/tools/blktap3/drivers/tapdisk-image.c
+++ b/tools/blktap3/drivers/tapdisk-image.c
@@ -499,7 +499,6 @@ tapdisk_image_open_chain(const char *par
     type = tapdisk_disktype_parse_params(params, &name);
     if (type >= 0) {
         err = __tapdisk_image_open_chain(type, name, flags, head, prt_path);
-        BUG_ON(TAILQ_EMPTY(head));
         return err;
     }
 
diff --git a/tools/blktap3/drivers/tapdisk-interface.c 
b/tools/blktap3/drivers/tapdisk-interface.c
--- a/tools/blktap3/drivers/tapdisk-interface.c
+++ b/tools/blktap3/drivers/tapdisk-interface.c
@@ -121,7 +121,7 @@ td_close(td_image_t * image)
 
        driver->refcnt--;
        if (!driver->refcnt && td_flag_test(driver->state, TD_DRIVER_OPEN)) {
-               driver->ops->td_close(driver);
+               driver->ops->td_close(driver, NULL);
                td_flag_clear(driver->state, TD_DRIVER_OPEN);
        }
 
diff --git a/tools/blktap3/drivers/tapdisk-log.c 
b/tools/blktap3/drivers/tapdisk-log.c
--- a/tools/blktap3/drivers/tapdisk-log.c
+++ b/tools/blktap3/drivers/tapdisk-log.c
@@ -88,7 +88,7 @@ static void tlog_logfile_save(void)
 
     err = tapdisk_logfile_rename(logfile, TLOG_DIR, name, ".log");
 
-    tlog_syslog(LOG_INFO, "logfile saved to %s: %d\n", logfile->path, err);
+    tlog_syslog(TLOG_INFO, "logfile saved to %s: %d\n", logfile->path, err);
 }
 
 static void tlog_logfile_close(void)
@@ -172,6 +172,9 @@ void tlog_vsyslog(int prio, const char *
 void tlog_syslog(int prio, const char *fmt, ...)
 {
     va_list ap;
+    static const int tlog_to_syslog[3] = {LOG_WARNING, LOG_INFO, LOG_DEBUG};
+
+    prio = prio >= 0 && prio < 3 ? tlog_to_syslog[prio] : LOG_INFO;
 
        va_start(ap, fmt);
     tlog_vsyslog(prio, fmt, ap);
diff --git a/tools/blktap3/drivers/tapdisk-log.h 
b/tools/blktap3/drivers/tapdisk-log.h
--- a/tools/blktap3/drivers/tapdisk-log.h
+++ b/tools/blktap3/drivers/tapdisk-log.h
@@ -25,6 +25,7 @@
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
+
 #ifndef _TAPDISK_LOG_H_
 #define _TAPDISK_LOG_H_
 
diff --git a/tools/blktap3/drivers/tapdisk-nbd.h 
b/tools/blktap3/drivers/tapdisk-nbd.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/tapdisk-nbd.h
@@ -0,0 +1,89 @@
+/*
+ * 1999 Copyright (C) Pavel Machek, pavel@xxxxxxx This code is GPL.
+ * 1999/11/04 Copyright (C) 1999 VMware, Inc. (Regis "HPReg" Duchesne)
+ *            Made nbd_end_request() use the io_request_lock
+ * 2001 Copyright (C) Steven Whitehouse
+ *            New nbd_end_request() for compatibility with new linux block
+ *            layer code.
+ * 2003/06/24 Louis D. Langholtz <ldl@xxxxxxxx>
+ *            Removed unneeded blksize_bits field from nbd_device struct.
+ *            Cleanup PARANOIA usage & code.
+ * 2004/02/19 Paul Clements
+ *            Removed PARANOIA, plus various cleanup and comments
+ */
+
+#ifndef LINUX_NBD_H
+#define LINUX_NBD_H
+
+//#include <linux/types.h>
+
+#define NBD_NEGOTIATION_MAGIC 0x00420281861253LL
+
+#define NBD_SET_SOCK   _IO( 0xab, 0 )
+#define NBD_SET_BLKSIZE        _IO( 0xab, 1 )
+#define NBD_SET_SIZE   _IO( 0xab, 2 )
+#define NBD_DO_IT      _IO( 0xab, 3 )
+#define NBD_CLEAR_SOCK _IO( 0xab, 4 )
+#define NBD_CLEAR_QUE  _IO( 0xab, 5 )
+#define NBD_PRINT_DEBUG        _IO( 0xab, 6 )
+#define NBD_SET_SIZE_BLOCKS    _IO( 0xab, 7 )
+#define NBD_DISCONNECT  _IO( 0xab, 8 )
+#define NBD_SET_TIMEOUT _IO( 0xab, 9 )
+#define NBD_SET_FLAGS _IO( 0xab, 10 )
+
+enum {
+       NBD_CMD_READ = 0,
+       NBD_CMD_WRITE = 1,
+       NBD_CMD_DISC = 2,
+       NBD_CMD_FLUSH = 3,
+       NBD_CMD_TRIM = 4
+};
+
+#define NBD_CMD_MASK_COMMAND 0x0000ffff
+#define NBD_CMD_FLAG_FUA (1<<16)
+
+/* values for flags field */
+#define NBD_FLAG_HAS_FLAGS      (1 << 0) /* Flags are there */
+#define NBD_FLAG_READ_ONLY      (1 << 1) /* Device is read-only */
+#define NBD_FLAG_SEND_FLUSH     (1 << 2) /* Send FLUSH */
+#define NBD_FLAG_SEND_FUA       (1 << 3) /* Send FUA (Force Unit Access) */
+#define NBD_FLAG_ROTATIONAL     (1 << 4) /* Use elevator algorithm -
+                                           rotational media */
+#define NBD_FLAG_SEND_TRIM      (1 << 5) /* Send TRIM (discard) */
+
+#define nbd_cmd(req) ((req)->cmd[0])
+
+/* userspace doesn't need the nbd_device structure */
+
+/* These are sent over the network in the request/reply magic fields */
+
+#define NBD_REQUEST_MAGIC 0x25609513
+#define NBD_REPLY_MAGIC 0x67446698
+/* Do *not* use magics: 0x12560953 0x96744668. */
+
+#define __be32 uint32_t
+#define __be64 uint64_t
+
+
+/*
+ * This is the packet used for communication between client and
+ * server. All data are in network byte order.
+ *
+ * The structure is packed so that it can be read from and written to the
+ * socket as one contiguous block.
+ */
+struct nbd_request {
+       __be32 magic;   /* NBD_REQUEST_MAGIC */
+       __be32 type;    /* one of the NBD_CMD_* values */
+       char handle[8]; /* opaque to the server, echoed in the reply */
+       __be64 from;    /* byte offset of the request */
+       __be32 len;     /* length of the request in bytes */
+} __attribute__ ((packed));
+
+/*
+ * This is the reply packet that nbd-server sends back to the client after
+ * it has completed an I/O request (or an error occurs).
+ *
+ * NOTE(review): unlike struct nbd_request this is not declared packed; its
+ * members are 4 + 4 + 8 bytes, so no padding is expected -- confirm for the
+ * targeted ABIs.
+ */
+struct nbd_reply {
+       __be32 magic;           /* NBD_REPLY_MAGIC */
+       __be32 error;           /* 0 = ok, else error   */
+       char handle[8];         /* handle you got from request  */
+};
+#endif
diff --git a/tools/blktap3/drivers/tapdisk-nbdserver.c 
b/tools/blktap3/drivers/tapdisk-nbdserver.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/tapdisk-nbdserver.c
@@ -0,0 +1,712 @@
+/*
+ * Copyright (c) 2012, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <sys/wait.h>
+#include <sys/un.h>
+
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "tapdisk-utils.h"
+#include "tapdisk-nbdserver.h"
+#include "tapdisk-fdreceiver.h"
+
+#include "tapdisk-nbd.h"
+
+#define NBD_SERVER_NUM_REQS TAPDISK_DATA_REQUESTS
+
+#define TAPDISK_NBDSERVER_LISTEN_SOCK_PATH "/var/run/blktap-control/nbdserver"
+#define TAPDISK_NBDSERVER_MAX_PATH_LEN 256
+
+/*
+ * Server
+ */
+
+#define INFO(_f, _a...)            tlog_syslog(TLOG_INFO, "nbd: " _f, ##_a)
+#define ERROR(_f, _a...)           tlog_syslog(TLOG_WARN, "nbd: " _f, ##_a)
+
+/*
+ * Per-request state of the NBD server: wraps the VBD request that is queued
+ * on behalf of an NBD client.
+ */
+struct td_nbdserver_req {
+       /* Embedded VBD request; containerof() maps it back to this struct. */
+       td_vbd_request_t        vreq;
+       /* NBD request handle, zero padded; also used as the request name. */
+       char                    id[16];
+       /* Single I/O vector describing the data buffer of the request. */
+       struct td_iovec         iov;
+};
+
+static void tapdisk_nbdserver_disable_client(td_nbdserver_client_t *client);
+static void tapdisk_nbdserver_clientcb(event_id_t id, char mode, void *data);
+int tapdisk_nbdserver_setup_listening_socket(td_nbdserver_t *server);
+int tapdisk_nbdserver_unpause(td_nbdserver_t *server);
+
+/*
+ * Pops a request off the client's free list.
+ *
+ * Returns the request, or NULL if the free list is empty.
+ */
+static td_nbdserver_req_t *
+tapdisk_nbdserver_alloc_request(td_nbdserver_client_t *client)
+{
+       if (likely(client->n_reqs_free)) {
+               /* The free list is a stack: take its topmost entry. */
+               client->n_reqs_free -= 1;
+               return client->reqs_free[client->n_reqs_free];
+       }
+
+       return NULL;
+}
+
+/*
+ * Pushes @req back onto @client's free list.
+ *
+ * If the free list already holds n_reqs entries the request is not stored
+ * (it is leaked) and an error is logged.
+ */
+static void
+tapdisk_nbdserver_free_request(td_nbdserver_client_t *client,
+               td_nbdserver_req_t *req)
+{
+       if (client->n_reqs_free >= client->n_reqs) {
+               /* The message used to say "client" although a request is meant. */
+               ERROR("Error, trying to free a request, but the free list "
+                               "is full! leaking!");
+               return;
+       }
+       client->reqs_free[client->n_reqs_free++] = req;
+}
+
+/*
+ * Releases the request pool of @client (requests, I/O vectors and free
+ * list) and resets the corresponding pointers to NULL.
+ */
+static void
+tapdisk_nbdserver_reqs_free(td_nbdserver_client_t *client)
+{
+       /* free(NULL) is a no-op, so no guards are needed. */
+       free(client->reqs);
+       client->reqs = NULL;
+
+       free(client->iovecs);
+       client->iovecs = NULL;
+
+       free(client->reqs_free);
+       client->reqs_free = NULL;
+}
+
+/*
+ * Allocates the request pool of @client: @n_reqs requests, as many I/O
+ * vectors, and a free list that initially contains every request.
+ *
+ * Returns 0 on success or -errno if an allocation fails, in which case
+ * everything allocated so far is released again.
+ */
+static int
+tapdisk_nbdserver_reqs_init(td_nbdserver_client_t *client, int n_reqs)
+{
+       int idx, rc;
+
+       INFO("Reqs init");
+
+       client->reqs = malloc(n_reqs * sizeof(td_nbdserver_req_t));
+       if (client->reqs == NULL)
+               goto nomem;
+
+       client->iovecs = malloc(n_reqs * sizeof(struct td_iovec));
+       if (client->iovecs == NULL)
+               goto nomem;
+
+       client->reqs_free = malloc(n_reqs * sizeof(td_nbdserver_req_t*));
+       if (client->reqs_free == NULL)
+               goto nomem;
+
+       client->n_reqs      = n_reqs;
+       client->n_reqs_free = 0;
+
+       /* Pair each request with its I/O vector and put it on the free list. */
+       idx = 0;
+       while (idx < n_reqs) {
+               td_nbdserver_req_t *slot = &client->reqs[idx];
+
+               slot->vreq.iov = &client->iovecs[idx];
+               tapdisk_nbdserver_free_request(client, slot);
+               idx++;
+       }
+
+       return 0;
+
+nomem:
+       /* Capture errno before any other call can change it. */
+       rc = -errno;
+       tapdisk_nbdserver_reqs_free(client);
+       return rc;
+}
+
+/*
+ * Allocates a zeroed client, sets up its pool of NBD_SERVER_NUM_REQS
+ * requests and links it at the head of server->clients. The new client has
+ * no socket (client_fd == -1) and no registered event.
+ *
+ * Returns the client, or NULL on failure.
+ */
+static td_nbdserver_client_t *
+tapdisk_nbdserver_alloc_client(td_nbdserver_t *server)
+{
+       td_nbdserver_client_t *new_client;
+       int rc;
+
+       INFO("Alloc client");
+
+       new_client = malloc(sizeof(td_nbdserver_client_t));
+       if (new_client == NULL) {
+               ERROR("Couldn't allocate client structure: %s",
+                               strerror(errno));
+               return NULL;
+       }
+       memset(new_client, 0, sizeof(td_nbdserver_client_t));
+
+       rc = tapdisk_nbdserver_reqs_init(new_client, NBD_SERVER_NUM_REQS);
+       if (rc < 0) {
+               ERROR("Couldn't allocate client reqs: %d", rc);
+               free(new_client);
+               return NULL;
+       }
+
+       new_client->client_fd = -1;
+       new_client->client_event_id = -1;
+       new_client->paused = 0;
+       new_client->server = server;
+       TAILQ_INSERT_HEAD(&server->clients, new_client, clientlist);
+
+       return new_client;
+}
+
+/*
+ * Destroys @client: unregisters its event if one is registered, unlinks it
+ * from its server's client list, and frees its request pool and the client
+ * itself. The client's socket is not closed here.
+ */
+static void
+tapdisk_nbdserver_free_client(td_nbdserver_client_t *client)
+{
+       INFO("Free client");
+
+       if (client == NULL) {
+               ERROR("Attempt to free NULL pointer!");
+               return;
+       }
+
+       /* Only an enabled client has an event to unregister. */
+       if (!(client->client_event_id < 0))
+               tapdisk_nbdserver_disable_client(client);
+
+       TAILQ_REMOVE(&client->server->clients, client, clientlist);
+
+       tapdisk_nbdserver_reqs_free(client);
+       free(client);
+}
+
+/*
+ * Registers a read event for @client's socket so that incoming requests are
+ * dispatched to tapdisk_nbdserver_clientcb().
+ *
+ * Returns the event id on success. Returns -1 if the client is already
+ * enabled or has no socket, or the negative value returned by the event
+ * registration if that fails.
+ */
+static int
+tapdisk_nbdserver_enable_client(td_nbdserver_client_t *client)
+{
+       INFO("Enable client");
+
+       if (!(client->client_event_id < 0)) {
+               ERROR("Attempting to enable an already-enabled client");
+               return -1;
+       }
+
+       if (client->client_fd < 0) {
+               ERROR("Attempting to register events on a closed client");
+               return -1;
+       }
+
+       client->client_event_id =
+               tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                               client->client_fd, 0,
+                               tapdisk_nbdserver_clientcb, client);
+
+       /* Success and failure both return the stored id; failure also logs. */
+       if (client->client_event_id < 0)
+               ERROR("Error registering events on client: %d",
+                               client->client_event_id);
+
+       return client->client_event_id;
+}
+
+/*
+ * Unregisters the read event of @client and marks the client as disabled
+ * (client_event_id == -1). Logs an error and does nothing if the client has
+ * no registered event.
+ */
+static void
+tapdisk_nbdserver_disable_client(td_nbdserver_client_t *client)
+{
+       INFO("Disable client");
+
+       if (!(client->client_event_id < 0)) {
+               tapdisk_server_unregister_event(client->client_event_id);
+               client->client_event_id = -1;
+               return;
+       }
+
+       ERROR("Attempting to disable an already-disabled client");
+}
+
+/*
+ * Returns a pointer to the address member of @ss: the IPv4 address when the
+ * family is AF_INET, otherwise the storage is interpreted as an IPv6 socket
+ * address.
+ */
+static void *
+get_in_addr(struct sockaddr_storage *ss)
+{
+       switch (ss->ss_family) {
+       case AF_INET:
+               return &((struct sockaddr_in *)ss)->sin_addr;
+       default:
+               return &((struct sockaddr_in6 *)ss)->sin6_addr;
+       }
+}
+
+/*
+ * Writes exactly @len bytes from @buf to socket @fd, retrying after partial
+ * sends. Returns 0 on success, -1 if send() fails or makes no progress.
+ */
+static int
+tapdisk_nbdserver_send_all(int fd, const void *buf, size_t len)
+{
+       const char *pos = buf;
+       ssize_t sent;
+
+       while (len > 0) {
+               sent = send(fd, pos, len, 0);
+               if (sent <= 0) {
+                       ERROR("Short send or error in "
+                                       "callback: %d", (int)sent);
+                       return -1;
+               }
+
+               pos += sent;
+               len -= sent;
+       }
+
+       return 0;
+}
+
+/*
+ * Completion callback of a VBD request queued by the NBD server. Sends the
+ * NBD reply header to the client, followed by the data that was read if the
+ * request was a successful read, then releases the data buffer and returns
+ * the request to the client's free list.
+ *
+ * @vreq: the completed request, embedded in a td_nbdserver_req_t.
+ * @error: completion status, forwarded in the reply header.
+ * @token: the td_nbdserver_client_t the request belongs to.
+ * @final: unused.
+ */
+static void
+__tapdisk_nbdserver_request_cb(td_vbd_request_t *vreq, int error,
+               void *token, int final __attribute__((unused)))
+{
+       td_nbdserver_client_t *client = token;
+       td_nbdserver_req_t *req = containerof(vreq, td_nbdserver_req_t, vreq);
+       struct nbd_reply reply;
+       size_t len;
+
+       /*
+        * NOTE(review): error is forwarded unchanged; confirm its sign and
+        * value match what the NBD client expects.
+        */
+       reply.magic = htonl(NBD_REPLY_MAGIC);
+       reply.error = htonl(error);
+       memcpy(reply.handle, req->id, sizeof(reply.handle));
+
+       if (client->client_fd < 0) {
+               ERROR("Finishing request for client that has disappeared");
+               goto finish;
+       }
+
+       /* Without a complete header a payload would corrupt the stream. */
+       if (tapdisk_nbdserver_send_all(client->client_fd, &reply,
+                               sizeof(reply)))
+               goto finish;
+
+       /*
+        * In the NBD protocol the data follows the reply header only for a
+        * read that completed without error.
+        */
+       if (vreq->op == TD_OP_READ && !error) {
+               len = (size_t)vreq->iov->secs << SECTOR_SHIFT;
+               tapdisk_nbdserver_send_all(client->client_fd,
+                               vreq->iov->base, len);
+       }
+
+finish:
+       free(vreq->iov->base);
+       tapdisk_nbdserver_free_request(client, req);
+}
+
+static void tapdisk_nbdserver_newclient_fd(td_nbdserver_t *server, int new_fd);
+
+/*
+ * Scheduler callback for a client socket: reads one NBD request from the
+ * client and acts on it.
+ *
+ *  - NBD_CMD_READ / NBD_CMD_WRITE: a 512-byte aligned buffer is allocated
+ *    (and, for writes, filled from the socket) and the request is queued on
+ *    the server's VBD; __tapdisk_nbdserver_request_cb() sends the reply.
+ *  - NBD_CMD_DISC: the client is freed and the negotiation is started anew
+ *    on the same socket.
+ *  - anything else, or any error: the client is freed.
+ *
+ * @id, @mode: unused.
+ * @data: the td_nbdserver_client_t whose socket became readable.
+ *
+ * NOTE(review): freeing the client on error does not close its socket and
+ * does not wait for requests that are still in flight; both look like they
+ * need attention but cannot be resolved from this function alone.
+ */
+static void
+tapdisk_nbdserver_clientcb(event_id_t id __attribute__((unused)),
+        char mode __attribute__((unused)), void *data)
+{
+       td_nbdserver_client_t *client = data;
+       td_nbdserver_t *server = client->server;
+       int rc;
+       int len;
+       int hdrlen;
+       int n;
+       int fd = client->client_fd;
+       char *ptr;
+       td_vbd_request_t *vreq;
+       struct nbd_request request;
+
+       td_nbdserver_req_t *req = tapdisk_nbdserver_alloc_request(client);
+
+       if (req == NULL) {
+               ERROR("Couldn't allocate request in clientcb - killing client");
+               tapdisk_nbdserver_free_client(client);
+               return;
+       }
+
+       vreq = &req->vreq;
+
+       /* Also leaves req->iov.base NULL, which the fail path relies on. */
+       memset(req, 0, sizeof(td_nbdserver_req_t));
+
+       /* Read the request the client has sent */
+
+       hdrlen = sizeof(struct nbd_request);
+
+       n = 0;
+       ptr = (char *) &request;
+       while (n < hdrlen) {
+               rc = recv(fd, ptr + n, hdrlen - n, 0);
+               /* errno is only meaningful if recv() actually failed. */
+               if (rc == 0 || (rc < 0 && errno == ECONNRESET)) {
+                       INFO("Client closed connection");
+                       goto fail;
+               }
+               if (rc < 0) {
+                       ERROR("Bad return in nbdserver_clientcb. Closing "
+                                       "connection: %s\n", strerror(errno));
+                       goto fail;
+               }
+               n += rc;
+       }
+
+       if (request.magic != htonl(NBD_REQUEST_MAGIC)) {
+               ERROR("Not enough magic");
+               goto fail;
+       }
+
+       request.from = ntohll(request.from);
+       request.type = ntohl(request.type);
+       len = ntohl(request.len);
+
+       /* Classify the request before any buffer is allocated for it. */
+       switch(request.type) {
+       case NBD_CMD_READ:
+               vreq->op = TD_OP_READ;
+               break;
+       case NBD_CMD_WRITE:
+               vreq->op = TD_OP_WRITE;
+               break;
+       case NBD_CMD_DISC:
+               INFO("Received close message. Sending reconnect "
+                               "header");
+               tapdisk_nbdserver_free_client(client);
+               INFO("About to send initial connection message");
+               tapdisk_nbdserver_newclient_fd(server, fd);
+               INFO("Sent");
+               return;
+
+       default:
+               ERROR("Unsupported operation: 0x%x", request.type);
+               goto fail;
+       }
+
+       /*
+        * NOTE(review): a misaligned request is only logged and then
+        * processed with truncated sector counts, as before; consider
+        * rejecting it instead.
+        */
+       if (((len & 0x1ff) != 0) || ((request.from & 0x1ff) != 0)) {
+               ERROR("Non sector-aligned request (%"PRIu64", %d)",
+                               request.from, len);
+       }
+
+       bzero(req->id, sizeof(req->id));
+       memcpy(req->id, request.handle, sizeof(request.handle));
+
+       /*
+        * posix_memalign() returns 0 or a positive error number, never a
+        * negative value. A length that became negative in the conversion to
+        * int turns into a huge size here and fails the allocation.
+        */
+       rc = posix_memalign(&req->iov.base, 512, len);
+       if (rc != 0) {
+               ERROR("posix_memalign failed (%d)", rc);
+               req->iov.base = NULL;
+               goto fail;
+       }
+
+       vreq->sec = request.from >> SECTOR_SHIFT;
+       vreq->iovcnt = 1;
+       vreq->iov = &req->iov;
+       vreq->iov->secs = len >> SECTOR_SHIFT;
+       vreq->token = client;
+       vreq->cb = __tapdisk_nbdserver_request_cb;
+       vreq->name = req->id;
+       vreq->vbd = server->vbd;
+
+       if (vreq->op == TD_OP_WRITE) {
+               /* The data to be written follows the request header. */
+               n = 0;
+               while (n < len) {
+                       rc = recv(fd, (char *)req->iov.base + n,
+                                       (len - n), 0);
+                       if (rc <= 0) {
+                               ERROR("Short recv or error in "
+                                               "callback: %d", rc);
+                               goto fail;
+                       }
+
+                       n += rc;
+               }
+       }
+
+       rc = tapdisk_vbd_queue_request(server->vbd, vreq);
+       if (rc) {
+               ERROR("tapdisk_vbd_queue_request failed: %d", rc);
+               /*
+                * NOTE(review): the buffer is deliberately not freed here,
+                * as before, because it is not visible whether the request
+                * callback may already have released it.
+                */
+               goto fail_client;
+       }
+
+       return;
+
+fail:
+       /* NULL unless the data buffer was allocated above. */
+       free(req->iov.base);
+fail_client:
+       tapdisk_nbdserver_free_client(client);
+       return;
+}
+
+/*
+ * Sets up a new NBD client on the connected socket @new_fd: sends the
+ * negotiation message (the "NBDMAGIC" string, the negotiation magic, the
+ * device size in bytes, 32 bits of zero flags and 124 zero bytes), then
+ * allocates a client for the socket and enables it.
+ *
+ * On any failure @new_fd is closed and no client is left behind.
+ */
+static void
+tapdisk_nbdserver_newclient_fd(td_nbdserver_t *server, int new_fd)
+{
+       /* Layout of the negotiation message. */
+       enum {
+               NEG_PAD_OFF = 28,       /* 8 + 8 + 8 + 4 bytes precede it */
+               NEG_PAD_LEN = 124,
+               NEG_LEN     = NEG_PAD_OFF + NEG_PAD_LEN
+       };
+       char buffer[256];
+       int rc;
+       uint64_t tmp64;
+       uint32_t tmp32;
+       td_nbdserver_client_t *client;
+
+       INFO("Got a new client!");
+
+       /* Spit out the NBD connection stuff */
+
+       memcpy(buffer, "NBDMAGIC", 8);
+       tmp64 = htonll(NBD_NEGOTIATION_MAGIC);
+       memcpy(buffer + 8, &tmp64, sizeof(tmp64));
+       tmp64 = htonll(server->info.size * server->info.sector_size);
+       memcpy(buffer + 16, &tmp64, sizeof(tmp64));
+       tmp32 = htonl(0);
+       memcpy(buffer + 24, &tmp32, sizeof(tmp32));
+       bzero(buffer + NEG_PAD_OFF, NEG_PAD_LEN);
+
+       rc = send(new_fd, buffer, NEG_LEN, 0);
+
+       if (rc < NEG_LEN) {
+               close(new_fd);
+               INFO("Short write in negotiation!");
+               /* The socket is gone: do not build a client on top of it. */
+               return;
+       }
+
+       INFO("About to alloc client");
+       client = tapdisk_nbdserver_alloc_client(server);
+       if (!client) {
+               ERROR("Error allocating client");
+               close(new_fd);
+               return;
+       }
+       INFO("Got an allocated client at %p", (void *)client);
+       client->client_fd = new_fd;
+       INFO("About to enable client");
+
+       if (tapdisk_nbdserver_enable_client(client) < 0) {
+               ERROR("Error enabling client");
+               tapdisk_nbdserver_free_client(client);
+               close(new_fd);
+               return;
+       }
+}
+
+/*
+ * fd receiver callback: turns a file descriptor received over the unix
+ * domain socket into a new NBD client.
+ *
+ * @fd: the received descriptor, or -1 if the message carried none.
+ * @msg: the message that accompanied the descriptor (only logged).
+ * @data: the td_nbdserver_t the fd receiver was started for.
+ */
+static void
+tapdisk_nbdserver_fdreceiver_cb(int fd, char *msg, void *data)
+{
+       td_nbdserver_t *server = data;
+       INFO("Received fd with msg: %s", msg);
+
+       /* The fd receiver reports a missing descriptor as -1. */
+       if (fd < 0) {
+               ERROR("No file descriptor received, ignoring message");
+               return;
+       }
+
+       tapdisk_nbdserver_newclient_fd(server, fd);
+}
+
+/*
+ * Scheduler callback for the server's listening socket: accepts a pending
+ * connection, logs the peer address and sets the connection up as a new NBD
+ * client.
+ *
+ * @id, @mode: unused.
+ * @data: the td_nbdserver_t owning the listening socket.
+ */
+static void
+tapdisk_nbdserver_newclient(event_id_t id __attribute__((unused)),
+        char mode __attribute__((unused)), void *data)
+{
+       struct sockaddr_storage their_addr;
+       socklen_t sin_size = sizeof(their_addr);
+       char s[INET6_ADDRSTRLEN];
+       int new_fd;
+       td_nbdserver_t *server = data;
+
+       INFO("About to accept (server->listening_fd = %d)",
+                       server->listening_fd);
+       new_fd = accept(server->listening_fd, (struct sockaddr *)&their_addr,
+                       &sin_size);
+
+       if (new_fd == -1) {
+               ERROR("accept (%s)", strerror(errno));
+               return;
+       }
+
+       /*
+        * inet_ntop() leaves the buffer untouched when it fails (e.g. for an
+        * address family it does not support), so never log it unchecked.
+        */
+       if (!inet_ntop(their_addr.ss_family, get_in_addr(&their_addr),
+                               s, sizeof s))
+               snprintf(s, sizeof s, "%s", "(unknown)");
+
+       INFO("server: got connection from %s\n", s);
+
+       tapdisk_nbdserver_newclient_fd(server, new_fd);
+}
+
+/*
+ * Allocate an NBD server for @vbd and start the fd receiver through which
+ * client sockets can be handed to it. No TCP socket is opened here; see
+ * tapdisk_nbdserver_listen().
+ *
+ * @vbd:  VBD to serve
+ * @info: disk geometry; a copy is stored in the server
+ *
+ * Returns the new server, or NULL if memory allocation or the fd receiver
+ * setup fails.
+ */
+td_nbdserver_t *
+tapdisk_nbdserver_alloc(td_vbd_t *vbd, td_disk_info_t info)
+{
+       td_nbdserver_t *server;
+       char fdreceiver_path[TAPDISK_NBDSERVER_MAX_PATH_LEN];
+       int i;
+
+       server = malloc(sizeof(*server));
+       if (!server) {
+               ERROR("Failed to allocate memory for nbdserver: %s",
+                               strerror(errno));
+               return NULL;
+       }
+
+       memset(server, 0, sizeof(*server));
+       server->listening_fd = -1;
+       server->listening_event_id = -1;
+       TAILQ_INIT(&server->clients);
+
+       server->vbd = vbd;
+       server->info = info;
+
+       snprintf(fdreceiver_path, TAPDISK_NBDSERVER_MAX_PATH_LEN, "%s%d-%s",
+                       TAPDISK_NBDSERVER_LISTEN_SOCK_PATH, getpid(),
+                       vbd->name);
+
+       /*
+        * XXX The path we're supplying will be appended to the socket path,
+        * so it cannot contain the '/' character. We replace all '/' with
+        * '-', leaving the fixed prefix untouched.
+        */
+       for (i = strlen(TAPDISK_NBDSERVER_LISTEN_SOCK_PATH);
+                       fdreceiver_path[i] != '\0'; i++) {
+               if (fdreceiver_path[i] == '/') {
+                       fdreceiver_path[i] = '-';
+               }
+       }
+
+       server->fdreceiver = td_fdreceiver_start(fdreceiver_path,
+                       tapdisk_nbdserver_fdreceiver_cb, server);
+
+       if (!server->fdreceiver) {
+               ERROR("Error setting up fd receiver");
+               /*
+                * listening_fd and listening_event_id are still -1 at this
+                * point, so there is no event to unregister and no socket to
+                * close; only the server structure has to be released.
+                */
+               free(server);
+               return NULL;
+       }
+
+       return server;
+}
+
+/*
+ * Open a TCP listening socket on @port (wildcard address) and register it
+ * with the scheduler so that incoming connections become NBD clients.
+ *
+ * @server: server allocated with tapdisk_nbdserver_alloc()
+ * @port:   TCP port number to listen on
+ *
+ * Returns 0 on success and -1 on failure. On failure no socket is left open
+ * and server->listening_fd is not left pointing at a closed descriptor.
+ */
+int
+tapdisk_nbdserver_listen(td_nbdserver_t *server, int port)
+{
+       struct addrinfo hints, *servinfo, *p;
+       char portstr[10];
+       int fd = -1;
+       int yes = 1;
+
+       memset(&hints, 0, sizeof(hints));
+       hints.ai_family = AF_UNSPEC;
+       hints.ai_socktype = SOCK_STREAM;
+       hints.ai_flags = AI_PASSIVE;
+
+       snprintf(portstr, sizeof(portstr), "%d", port);
+
+       if (getaddrinfo(NULL, portstr, &hints, &servinfo) != 0) {
+               ERROR("Failed to getaddrinfo");
+               return -1;
+       }
+
+       for (p = servinfo; p != NULL; p = p->ai_next) {
+               /*
+                * Create the socket in the family of the candidate address;
+                * an AF_INET socket cannot be bound to an IPv6 address.
+                */
+               fd = socket(p->ai_family, p->ai_socktype, p->ai_protocol);
+               if (fd == -1) {
+                       ERROR("Failed to create socket");
+                       continue;
+               }
+
+               if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+                                       &yes, sizeof(int)) == -1) {
+                       ERROR("Failed to setsockopt");
+                       close(fd);
+                       freeaddrinfo(servinfo);
+                       return -1;
+               }
+
+               if (bind(fd, p->ai_addr, p->ai_addrlen) == -1) {
+                       ERROR("Failed to bind");
+                       close(fd);
+                       fd = -1;
+                       continue;
+               }
+
+               break;
+       }
+
+       /* The address list is no longer needed on any path from here on. */
+       freeaddrinfo(servinfo);
+
+       if (fd == -1) {
+               /* Every candidate failed and its socket is already closed. */
+               ERROR("Failed to bind");
+               return -1;
+       }
+
+       if (listen(fd, 10) == -1) {
+               ERROR("listen");
+               close(fd);
+               return -1;
+       }
+
+       server->listening_fd = fd;
+
+       /* Registers the accept callback for listening_fd. */
+       tapdisk_nbdserver_unpause(server);
+
+       if (server->listening_event_id < 0) {
+               close(fd);
+               server->listening_fd = -1;
+               return -1;
+       }
+
+       INFO("Successfully started NBD server");
+
+       return 0;
+}
+
+/*
+ * Pause the server: disable every client that has an event registered and
+ * is not already paused, and stop watching the listening socket. The
+ * listening socket itself stays open.
+ *
+ * @server: server to pause
+ */
+void
+tapdisk_nbdserver_pause(td_nbdserver_t *server)
+{
+       struct td_nbdserver_client *pos, *q;
+
+       INFO("NBD server pause(%p)", server);
+
+       TAILQ_FOREACH_SAFE(pos, &server->clients, clientlist, q) {
+               if (pos->paused != 1 && pos->client_event_id >= 0) {
+                       tapdisk_nbdserver_disable_client(pos);
+                       pos->paused = 1;
+               }
+       }
+
+       if (server->listening_event_id >= 0) {
+               tapdisk_server_unregister_event(server->listening_event_id);
+               /*
+                * Forget the stale id: tapdisk_nbdserver_unpause() only
+                * re-registers when the id is negative, and
+                * tapdisk_nbdserver_free() would otherwise unregister the
+                * same id a second time.
+                */
+               server->listening_event_id = -1;
+       }
+}
+
+/*
+ * Resume the server: re-enable every client marked as paused and, if the
+ * listening socket is open but has no event registered, register the accept
+ * callback for it.
+ *
+ * @server: server to resume
+ *
+ * Returns server->listening_event_id as it stands after the call.
+ */
+int
+tapdisk_nbdserver_unpause(td_nbdserver_t *server)
+{
+       struct td_nbdserver_client *client, *next;
+
+       INFO("NBD server unpause(%p) - listening_fd = %d", server,
+                       server->listening_fd);
+
+       TAILQ_FOREACH_SAFE(client, &server->clients, clientlist, next) {
+               if (client->paused != 1)
+                       continue;
+
+               tapdisk_nbdserver_enable_client(client);
+               client->paused = 0;
+       }
+
+       /* Nothing to register: already watching, or no socket to watch. */
+       if (server->listening_event_id >= 0 || server->listening_fd < 0)
+               return server->listening_event_id;
+
+       server->listening_event_id = tapdisk_server_register_event(
+                       SCHEDULER_POLL_READ_FD, server->listening_fd, 0,
+                       tapdisk_nbdserver_newclient, server);
+       INFO("registering for listening_fd");
+
+       return server->listening_event_id;
+}
+
+/*
+ * Tear the server down: free all clients, unregister and close the
+ * listening socket, stop the fd receiver and release the server itself.
+ *
+ * @server: server to destroy; must not be used afterwards
+ */
+void
+tapdisk_nbdserver_free(td_nbdserver_t *server)
+{
+       struct td_nbdserver_client *client, *next;
+
+       INFO("NBD server free(%p)", server);
+
+       /* Use the _SAFE iterator: the current entry is handed to free_client. */
+       TAILQ_FOREACH_SAFE(client, &server->clients, clientlist, next) {
+               tapdisk_nbdserver_free_client(client);
+       }
+
+       if (!(server->listening_event_id < 0)) {
+               tapdisk_server_unregister_event(server->listening_event_id);
+               server->listening_event_id = -1;
+       }
+
+       if (!(server->listening_fd < 0)) {
+               close(server->listening_fd);
+               server->listening_fd = -1;
+       }
+
+       if (server->fdreceiver != NULL) {
+               td_fdreceiver_stop(server->fdreceiver);
+       }
+
+       free(server);
+}
diff --git a/tools/blktap3/drivers/tapdisk-nbdserver.h 
b/tools/blktap3/drivers/tapdisk-nbdserver.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/tapdisk-nbdserver.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2012, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _TAPDISK_NBDSERVER_H_
+#define _TAPDISK_NBDSERVER_H_
+
+typedef struct td_nbdserver td_nbdserver_t;
+typedef struct td_nbdserver_req td_nbdserver_req_t;
+typedef struct td_nbdserver_client td_nbdserver_client_t;
+
+#include "blktap3.h"
+#include "tapdisk-vbd.h"
+
+TAILQ_HEAD(tqh_td_nbdserver_client, td_nbdserver_client);
+
+/* State of one NBD server instance serving a single VBD. */
+struct td_nbdserver {
+       /* VBD and disk geometry passed to tapdisk_nbdserver_alloc() */
+       td_vbd_t               *vbd;
+       td_disk_info_t          info;
+
+       /* listening socket, -1 while none is open */
+       int                     listening_fd;
+       /* scheduler event id for listening_fd, negative while unregistered */
+       int                     listening_event_id;
+
+       /* handle returned by td_fdreceiver_start() */
+       struct td_fdreceiver   *fdreceiver;
+
+    /**
+     * list of td_nbdserver_client
+     */
+    struct tqh_td_nbdserver_client clients;
+};
+
+/* State of one client connected to a td_nbdserver. */
+struct td_nbdserver_client {
+       /*
+        * Request pool. NOTE(review): presumably n_reqs entries in reqs and
+        * iovecs, with reqs_free acting as a stack of n_reqs_free unused
+        * requests -- confirm against tapdisk-nbdserver.c.
+        */
+       int                     n_reqs;
+       td_nbdserver_req_t     *reqs;
+       struct td_iovec        *iovecs;
+       int                     n_reqs_free;
+       td_nbdserver_req_t    **reqs_free;
+
+       /* client socket and its scheduler event id (>= 0 when registered) */
+       int                     client_fd;
+       int                     client_event_id;
+
+       /* NOTE(review): presumably the owning server -- confirm */
+       td_nbdserver_t         *server;
+
+    /**
+     * for linked lists
+     */
+    TAILQ_ENTRY(td_nbdserver_client) clientlist; /* TODO rename to entry */
+
+       /* set to 1 by tapdisk_nbdserver_pause(), cleared to 0 on unpause */
+       int                     paused;
+};
+
+td_nbdserver_t *tapdisk_nbdserver_alloc(td_vbd_t *, td_disk_info_t);
+int tapdisk_nbdserver_listen(td_nbdserver_t *, int);
+void tapdisk_nbdserver_free(td_nbdserver_t *);
+void tapdisk_nbdserver_pause(td_nbdserver_t *);
+int tapdisk_nbdserver_unpause(td_nbdserver_t *);
+
+#endif /* _TAPDISK_NBDSERVER_H_ */
diff --git a/tools/blktap3/drivers/tapdisk-queue.c 
b/tools/blktap3/drivers/tapdisk-queue.c
--- a/tools/blktap3/drivers/tapdisk-queue.c
+++ b/tools/blktap3/drivers/tapdisk-queue.c
@@ -347,7 +347,7 @@ static int __lio_setup_aio_eventfd(struc
 static int tapdisk_lio_setup_aio(struct tqueue *queue, int qlen)
 {
        struct lio *lio = queue->tio_data;
-       int err;
+       int err, old_err = 0;
 
        lio->aio_ctx  =  0;
        lio->event_fd = -1;
@@ -359,17 +359,20 @@ static int tapdisk_lio_setup_aio(struct 
 
        err = !tapdisk_lio_check_resfd();
        if (!err)
-               err = __lio_setup_aio_eventfd(queue, qlen);
+               err = old_err = __lio_setup_aio_eventfd(queue, qlen);
        if (err)
                err = __lio_setup_aio_poll(queue, qlen);
 
-       if (err == -EAGAIN)
+       /* __lio_setup_aio_poll seems to always fail with EINVAL on newer systems,
+        * probably because it initializes the output parameter of io_setup to a
+        * non-zero value and the kernel patch that understands this is missing */
+       if (err == -EAGAIN || (err && old_err == -EAGAIN))
                goto fail_rsv;
 fail:
        return err;
 
 fail_rsv:
-       DPRINTF("Couldn't setup AIO context. If you are trying to "
+       EPRINTF("Couldn't setup AIO context. If you are trying to "
                "concurrently use a large number of blktap-based disks, you may 
"
                "need to increase the system-wide aio request limit. "
                "(e.g. 'echo 1048576 > /proc/sys/fs/aio-max-nr')\n");
diff --git a/tools/blktap3/drivers/tapdisk-stats.c 
b/tools/blktap3/drivers/tapdisk-stats.c
--- a/tools/blktap3/drivers/tapdisk-stats.c
+++ b/tools/blktap3/drivers/tapdisk-stats.c
@@ -29,6 +29,8 @@
 
 #include <stdio.h>
 #include <stdarg.h>
+#include <stdlib.h>
+#include <errno.h>
 
 #include "tapdisk.h"
 #include "tapdisk-stats.h"
@@ -38,8 +40,40 @@
 
 static void __stats_vsprintf(td_stats_t * st, const char *fmt, va_list ap)
 {
-    size_t size = st->buf + st->size - st->pos;
-    st->pos += vsnprintf(st->pos, size, fmt, ap);
+       va_list aq;
+       void *buf;
+       int written;
+       size_t size, new_size, off;
+
+       /*
+        * Append the formatted text at st->pos, doubling the buffer until it
+        * fits. If the buffer cannot be grown, st->err is set to -ENOMEM and
+        * the output stays truncated.
+        */
+       for (;;) {
+               size = st->buf + st->size - st->pos;
+               /* A va_list can be consumed only once; format from a copy. */
+               va_copy(aq, ap);
+               written = vsnprintf(st->pos, size, fmt, aq);
+               va_end(aq);
+               if (written < 0)
+                       return;
+               /* The count excludes the NUL: written == size was truncated. */
+               if ((size_t)written < size)
+                       break;
+               new_size = st->size * 2;
+               buf = realloc(st->buf, new_size);
+               if (!buf) {
+                       st->err = -ENOMEM;
+                       written = size;
+                       break;
+               }
+               off = st->pos - st->buf;
+               st->buf = buf;
+               st->size = new_size;
+               st->pos = st->buf + off;
+       }
+       st->pos += written;
 }
 
 static void __printf(2, 3)
diff --git a/tools/blktap3/drivers/tapdisk-stats.h 
b/tools/blktap3/drivers/tapdisk-stats.h
--- a/tools/blktap3/drivers/tapdisk-stats.h
+++ b/tools/blktap3/drivers/tapdisk-stats.h
@@ -42,6 +42,7 @@ struct tapdisk_stats_ctx {
 
     int n_elem[TD_STATS_MAX_DEPTH];
     int depth;
+       int             err;
 };
 
 typedef struct tapdisk_stats_ctx td_stats_t;
@@ -58,6 +59,9 @@ tapdisk_stats_init(td_stats_t * st, char
 
 static inline size_t tapdisk_stats_length(td_stats_t * st)
 {
+       if (st->err)
+               return st->err;
+
     return st->pos - st->buf;
 }
 
diff --git a/tools/blktap3/drivers/tapdisk-stream.c 
b/tools/blktap3/drivers/tapdisk-stream.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/tapdisk-stream.c
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include "list.h"
+#include "scheduler.h"
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-disktype.h"
+
+#define POLL_READ                        0
+#define POLL_WRITE                       1
+
+#define MIN(a, b)                        ((a) < (b) ? (a) : (b))
+/* Unconditional panic; the argument is ignored. */
+#define BUG(_cond)                       td_panic()
+/*
+ * Panic when _cond is true. Wrapped in do { } while (0) so the macro behaves
+ * as a single statement, including in front of an else.
+ */
+#define BUG_ON(_cond)                                          \
+       do {                                                    \
+               if (unlikely(_cond)) { td_panic(); }            \
+       } while (0)
+
+#define TD_STREAM_MAX_REQS               16
+#define TD_STREAM_REQ_SIZE               (sysconf(_SC_PAGE_SIZE) * 32)
+
+typedef struct tapdisk_stream_request td_stream_req_t;
+typedef struct tapdisk_stream td_stream_t;
+
+/* One in-flight or idle read request of the stream. */
+struct tapdisk_stream_request {
+       /* data buffer of TD_STREAM_REQ_SIZE bytes obtained with mmap() */
+       void                            *buf;
+       /* single I/O vector; set to describe buf when the request is queued */
+       struct td_iovec                  iov;
+       /* the VBD request submitted for this read */
+       td_vbd_request_t                 vreq;
+       /* link on the stream's pending or completed list */
+       struct list_head                 entry;
+};
+
+/* State of one image-to-stdout streaming session. */
+struct tapdisk_stream {
+       /* VBD the image is opened on */
+       td_vbd_t                        *vbd;
+
+       /* VBD id, taken from tapdisk_stream_count */
+       unsigned int                     id;
+       /* initialised to -1 and not otherwise used in this file */
+       int                              in_fd;
+       /* duplicate of STDOUT_FILENO that the data is written to */
+       int                              out_fd;
+
+       /* first error seen; non-zero stops further reads from being queued */
+       int                              err;
+
+       /* next sector to read */
+       td_sector_t                      sec_in;
+       /* next sector to write to out_fd */
+       td_sector_t                      sec_out;
+       /* sectors that still have to be queued for reading */
+       uint64_t                         count;
+
+       /* requests submitted to the VBD and not yet completed */
+       struct list_head                 pending_list;
+       /* completed requests waiting to be written, ordered by sector */
+       struct list_head                 completed_list;
+
+       /* request pool, and a stack of the n_free currently unused entries */
+       td_stream_req_t                  reqs[TD_STREAM_MAX_REQS];
+       td_stream_req_t                 *free[TD_STREAM_MAX_REQS];
+       int                              n_free;
+};
+
+static unsigned int tapdisk_stream_count;
+
+static void tapdisk_stream_close_image(td_stream_t *);
+static void tapdisk_stream_queue_requests(td_stream_t *);
+
+/*
+ * Print the command-line synopsis to stdout and terminate the process.
+ *
+ * @app: program name to show
+ * @err: process exit status
+ */
+static void
+usage(const char *app, int err)
+{
+       printf("usage: %s <-n type:/path/to/image> "
+              "[-c sector count] [-s skip sectors]\n", app);
+       exit(err);
+}
+
+/*
+ * Tell whether the stream is finished: no request is pending and either all
+ * sectors have been queued or an error occurred.
+ *
+ * Returns 1 when the stream should stop, 0 otherwise.
+ */
+static inline int
+tapdisk_stream_stop(td_stream_t *s)
+{
+       if (!list_empty(&s->pending_list))
+               return 0;
+
+       return (s->err || !s->count);
+}
+
+/*
+ * Initialise @req and map an anonymous, private, read/write buffer of
+ * TD_STREAM_REQ_SIZE bytes for it.
+ *
+ * Returns 0 on success or -errno if mmap() fails, in which case req->buf is
+ * NULL.
+ */
+static int
+tapdisk_stream_req_create(td_stream_req_t *req)
+{
+       memset(req, 0, sizeof(*req));
+       INIT_LIST_HEAD(&req->entry);
+
+       req->buf = mmap(NULL, TD_STREAM_REQ_SIZE, PROT_READ|PROT_WRITE,
+                       MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+       if (req->buf != MAP_FAILED)
+               return 0;
+
+       req->buf = NULL;
+       return -errno;
+}
+
+/*
+ * Release the buffer mapped by tapdisk_stream_req_create(). Does nothing if
+ * the request has no buffer; panics if munmap() fails.
+ */
+static void
+tapdisk_stream_req_destroy(td_stream_req_t *req)
+{
+       if (req->buf) {
+               /*
+                * Unmap req->buf, the address mmap() returned. req->iov.base
+                * is only set once a request has been queued, so it is still
+                * NULL for requests that were never used.
+                */
+               int err = munmap(req->buf, TD_STREAM_REQ_SIZE);
+               BUG_ON(err);
+               req->buf = NULL;
+               req->iov.base = NULL;
+       }
+}
+
+/*
+ * Take a request off the free stack.
+ *
+ * Returns the request, or NULL when none is free.
+ */
+td_stream_req_t *
+tapdisk_stream_alloc_req(td_stream_t *s)
+{
+       if (!likely(s->n_free))
+               return NULL;
+
+       s->n_free -= 1;
+       return s->free[s->n_free];
+}
+
+/*
+ * Return @req to the free stack. Panics if the stack is already full or if
+ * @req is still linked on a list.
+ */
+void
+tapdisk_stream_free_req(td_stream_t *s, td_stream_req_t *req)
+{
+       /* s->free[] holds TD_STREAM_MAX_REQS entries; check that bound. */
+       BUG_ON(s->n_free >= TD_STREAM_MAX_REQS);
+       BUG_ON(!list_empty(&req->entry));
+       s->free[s->n_free++] = req;
+}
+
+/*
+ * Drain the free stack and release the buffer of every request on it.
+ * Requests that are not on the free stack are not touched.
+ */
+static void
+tapdisk_stream_destroy_reqs(td_stream_t *s)
+{
+       td_stream_req_t *req;
+
+       while ((req = tapdisk_stream_alloc_req(s)) != NULL)
+               tapdisk_stream_req_destroy(req);
+}
+
+/*
+ * Create all TD_STREAM_MAX_REQS requests and put them on the free stack.
+ *
+ * Returns 0 on success. On failure the requests created so far are
+ * destroyed and the error from tapdisk_stream_req_create() is returned.
+ */
+static int
+tapdisk_stream_create_reqs(td_stream_t *s)
+{
+       int idx, err = 0;
+
+       s->n_free = 0;
+
+       for (idx = 0; idx < TD_STREAM_MAX_REQS; idx++) {
+               err = tapdisk_stream_req_create(&s->reqs[idx]);
+               if (err)
+                       break;
+
+               tapdisk_stream_free_req(s, &s->reqs[idx]);
+       }
+
+       if (err)
+               tapdisk_stream_destroy_reqs(s);
+
+       return err;
+}
+
+/*
+ * Write the data of a completed request to s->out_fd.
+ *
+ * Short writes are continued and EINTR is retried. Any other write error is
+ * reported on stderr and recorded in s->err (unless an error is already
+ * recorded), which stops the stream.
+ *
+ * Returns the number of sectors the request covers, whether or not the
+ * write succeeded.
+ */
+static int
+tapdisk_stream_print_request(td_stream_t *s, td_stream_req_t *req)
+{
+       struct td_iovec *iov = &req->iov;
+       const char *pos = iov->base;
+       size_t left = (size_t)iov->secs << SECTOR_SHIFT;
+
+       while (left) {
+               ssize_t n = write(s->out_fd, pos, left);
+
+               if (n < 0) {
+                       int err = errno;
+
+                       if (err == EINTR)
+                               continue;
+
+                       fprintf(stderr, "error writing output: %d\n", err);
+                       if (!s->err)
+                               s->err = err;
+                       break;
+               }
+
+               pos  += n;
+               left -= n;
+       }
+
+       return iov->secs;
+}
+
+/*
+ * Write out completed requests in sector order, starting at s->sec_out, and
+ * return each written request to the free stack. Stops at the first gap.
+ */
+static void
+tapdisk_stream_write_data(td_stream_t *s)
+{
+       td_stream_req_t *cur, *tmp;
+
+       list_for_each_entry_safe(cur, tmp, &s->completed_list, entry) {
+               /* The next sector due has not completed yet. */
+               if (s->sec_out != cur->vreq.sec)
+                       return;
+
+               s->sec_out += tapdisk_stream_print_request(s, cur);
+
+               list_del_init(&cur->entry);
+               tapdisk_stream_free_req(s, cur);
+       }
+}
+
+/*
+ * Insert @req into s->completed_list in front of the first entry that
+ * starts at a higher sector, which keeps the list ordered by start sector.
+ */
+static inline void
+tapdisk_stream_queue_completed(td_stream_t *s, td_stream_req_t *req)
+{
+       td_stream_req_t *itr;
+
+       /* Stop at the first completed request that starts after @req. */
+       list_for_each_entry(itr, &s->completed_list, entry)
+               if (req->vreq.sec < itr->vreq.sec)
+                       break;
+
+       /*
+        * NOTE(review): when the loop runs to the end, this presumably relies
+        * on &itr->entry being the list head, so that @req is appended at the
+        * tail -- confirm against list.h.
+        */
+       list_add_tail(&req->entry, &itr->entry);
+}
+
+/*
+ * Handle the completion of a read request.
+ *
+ * @s:     stream the request belongs to
+ * @req:   completed request; it is unlinked from whatever list it is on
+ * @error: non-zero if the read failed
+ * @final: non-zero if this is the last completion for the request
+ *
+ * A successful request is queued for output. A failed one is returned to
+ * the free stack and s->err is set to EIO. On the final completion, pending
+ * output is written and the stream is either closed or fed more requests.
+ */
+static void
+tapdisk_stream_complete_request(td_stream_t *s, td_stream_req_t *req, 
+                               int error, int final)
+{
+       list_del_init(&req->entry);
+
+       if (unlikely(error)) {
+               s->err = EIO;
+               tapdisk_stream_free_req(s, req);
+               fprintf(stderr, "error reading sector 0x%"PRIx64"\n",
+                       req->vreq.sec);
+       } else {
+               tapdisk_stream_queue_completed(s, req);
+       }
+
+       if (!final)
+               return;
+
+       tapdisk_stream_write_data(s);
+
+       if (!tapdisk_stream_stop(s)) {
+               tapdisk_stream_queue_requests(s);
+               return;
+       }
+
+       tapdisk_stream_close_image(s);
+}
+
+/*
+ * VBD completion callback: recover the stream request that embeds @vreq and
+ * pass the result on to tapdisk_stream_complete_request().
+ *
+ * @token is the td_stream_t stored in vreq->token when it was queued.
+ */
+static void
+__tapdisk_stream_request_cb(td_vbd_request_t *vreq, int error,
+                           void *token, int final)
+{
+       tapdisk_stream_complete_request((td_stream_t *)token,
+                                       containerof(vreq, td_stream_req_t, vreq),
+                                       error, final);
+}
+
+/*
+ * Set @req up as a read of the next chunk of the image (at most
+ * TD_STREAM_REQ_SIZE bytes, limited by the sectors left) and submit it to
+ * the VBD. s->count and s->sec_in are advanced by the chunk size.
+ *
+ * If submission fails, the request is completed immediately with the error.
+ */
+static void
+tapdisk_stream_queue_request(td_stream_t *s, td_stream_req_t *req)
+{
+       td_vbd_request_t *vreq;
+       struct td_iovec *iov;
+       int secs, err;
+
+       iov   = &req->iov;
+       secs  = MIN(TD_STREAM_REQ_SIZE >> SECTOR_SHIFT, s->count);
+
+       iov->base           = req->buf;
+       iov->secs           = secs;
+
+       vreq                = &req->vreq;
+       vreq->iov           = iov;
+       vreq->iovcnt        = 1;
+       vreq->sec           = s->sec_in;
+       vreq->op            = TD_OP_READ;
+       vreq->name          = NULL;
+       vreq->token         = s;
+       vreq->cb            = __tapdisk_stream_request_cb;
+
+       s->count  -= secs;
+       s->sec_in += secs;
+
+       /*
+        * Link the request on the pending list before submitting it. The
+        * completion path starts by unlinking the request and, on error,
+        * returns it to the free stack; linking it afterwards would leave a
+        * freed request on the pending list.
+        */
+       list_add_tail(&req->entry, &s->pending_list);
+
+       err = tapdisk_vbd_queue_request(s->vbd, vreq);
+       if (err)
+               tapdisk_stream_complete_request(s, req, err, 1);
+}
+
+/*
+ * Submit read requests until all sectors are queued, an error is recorded
+ * or no free request is left.
+ */
+static void
+tapdisk_stream_queue_requests(td_stream_t *s)
+{
+       td_stream_req_t *req;
+
+       for (;;) {
+               if (!s->count || s->err)
+                       return;
+
+               req = tapdisk_stream_alloc_req(s);
+               if (!req)
+                       return;
+
+               tapdisk_stream_queue_request(s, req);
+       }
+}
+
+/*
+ * Initialise the tapdisk server, create a VBD for this stream and open the
+ * image @name read-only on it.
+ *
+ * Returns 0 on success. On failure the error of the failing step (ENODEV if
+ * the VBD cannot be looked up) is printed to stderr and returned.
+ */
+static int
+tapdisk_stream_open_image(struct tapdisk_stream *s, const char *name)
+{
+       int err;
+
+       s->id = tapdisk_stream_count++;
+
+       err = tapdisk_server_initialize(NULL, NULL);
+
+       if (!err)
+               err = tapdisk_vbd_initialize(-1, -1, s->id);
+
+       if (!err) {
+               s->vbd = tapdisk_server_get_vbd(s->id);
+               if (!s->vbd)
+                       err = ENODEV;
+       }
+
+       if (!err)
+               err = tapdisk_vbd_open_vdi(s->vbd, name, TD_OPEN_RDONLY, -1);
+
+       if (err)
+               fprintf(stderr, "failed to open %s: %d\n", name, err);
+
+       return err;
+}
+
+/*
+ * Close the stream's image, if its VBD is still known to the server: close
+ * the VDI, remove the VBD from the server and free it.
+ */
+static void
+tapdisk_stream_close_image(td_stream_t *s)
+{
+       td_vbd_t *vbd = tapdisk_server_get_vbd(s->id);
+
+       if (!vbd)
+               return;
+
+       tapdisk_vbd_close_vdi(vbd);
+       tapdisk_server_remove_vbd(vbd);
+       free(vbd->name);
+       free(vbd);
+       s->vbd = NULL;
+}
+
+/*
+ * Select the sector range to stream.
+ *
+ * @s:     stream with an open image
+ * @count: number of sectors to stream, or (uint64_t)-1 for "up to the end
+ *         of the image"
+ * @skip:  first sector to stream
+ *
+ * Returns 0 on success, the error from tapdisk_vbd_get_disk_info() if the
+ * image size cannot be determined, or -EINVAL if the range does not lie
+ * within the image.
+ */
+static int
+tapdisk_stream_set_position(td_stream_t *s,
+                           uint64_t count, uint64_t skip)
+{
+       int err;
+       td_disk_info_t info;
+
+       err = tapdisk_vbd_get_disk_info(s->vbd, &info);
+       if (err) {
+               fprintf(stderr, "failed getting image size: %d\n", err);
+               return err;
+       }
+
+       /* Check skip first so that info.size - skip below cannot wrap. */
+       if (skip > info.size) {
+               fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
+                       skip, info.size);
+               return -EINVAL;
+       }
+
+       if (count == (uint64_t)-1)
+               count = info.size - skip;
+
+       /* Compare without computing count + skip, which could overflow. */
+       if (count > info.size - skip) {
+               fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n",
+                       count + skip, info.size);
+               return -EINVAL;
+       }
+
+       s->sec_in  = skip;
+       s->sec_out = skip;
+       s->count   = count;
+
+       return 0;
+}
+
+/*
+ * Event callback that intentionally does nothing; all arguments are
+ * ignored. It is not referenced elsewhere in this file.
+ */
+void
+__tapdisk_stream_event_cb(event_id_t id, char mode, void *arg)
+{
+}
+
+/*
+ * Duplicate stdout into s->out_fd.
+ *
+ * Returns 0 on success or the errno value of the failed dup().
+ */
+static int
+tapdisk_stream_open_fds(struct tapdisk_stream *s)
+{
+       s->out_fd = dup(STDOUT_FILENO);
+       if (s->out_fd == -1) {
+               /* Save errno: fprintf() is allowed to change it. */
+               int err = errno;
+
+               fprintf(stderr, "failed to open output: %d\n", err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Release everything held by the stream: the free requests, the image and
+ * the output descriptor.
+ */
+static void
+tapdisk_stream_close(struct tapdisk_stream *s)
+{
+       tapdisk_stream_destroy_reqs(s);
+       tapdisk_stream_close_image(s);
+
+       if (s->out_fd < 0)
+               return;
+
+       close(s->out_fd);
+       s->out_fd = -1;
+}
+
+/*
+ * Initialise @s and prepare it for streaming: open the output descriptor,
+ * open the image @name, select the sector range and create the requests.
+ *
+ * Returns 0 on success. On failure the stream is closed again and the error
+ * of the failing step is returned.
+ */
+static int
+tapdisk_stream_open(struct tapdisk_stream *s, const char *name,
+                   uint64_t count, uint64_t skip)
+{
+       int err;
+
+       memset(s, 0, sizeof(*s));
+       s->in_fd = s->out_fd = -1;
+       INIT_LIST_HEAD(&s->pending_list);
+       INIT_LIST_HEAD(&s->completed_list);
+
+       err = tapdisk_stream_open_fds(s);
+       if (err)
+               goto fail;
+
+       err = tapdisk_stream_open_image(s, name);
+       if (err)
+               goto fail;
+
+       err = tapdisk_stream_set_position(s, count, skip);
+       if (err)
+               goto fail;
+
+       err = tapdisk_stream_create_reqs(s);
+       if (err)
+               goto fail;
+
+       return 0;
+
+fail:
+       tapdisk_stream_close(s);
+       return err;
+}
+
+/*
+ * Queue the initial read requests, run the tapdisk server loop and return
+ * the stream's error status (s->err) once the loop returns.
+ */
+static int
+tapdisk_stream_run(struct tapdisk_stream *s)
+{
+       tapdisk_stream_queue_requests(s);
+       tapdisk_server_run();
+       return s->err;
+}
+
+/*
+ * tapdisk-stream entry point: read sectors from an image and write them to
+ * stdout.
+ *
+ * Options:
+ *   -n type:/path/to/image   image to read (required)
+ *   -c count                 number of sectors (default: to end of image)
+ *   -s skip                  first sector (default: 0)
+ *   -h                       print usage and exit
+ *
+ * Returns 0 on success or the error reported while opening or running the
+ * stream.
+ */
+int
+main(int argc, char *argv[])
+{
+       int c, err;
+       const char *params;
+       uint64_t count, skip;
+       struct tapdisk_stream stream;
+
+       err    = 0;
+       skip   = 0;
+       /* (uint64_t)-1 asks tapdisk_stream_set_position for "to the end" */
+       count  = (uint64_t)-1;
+       params = NULL;
+
+       while ((c = getopt(argc, argv, "n:c:s:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       params = optarg;
+                       break;
+               case 'c':
+                       /* NOTE(review): no validation of the number here */
+                       count = strtoull(optarg, NULL, 10);
+                       break;
+               case 's':
+                       skip = strtoull(optarg, NULL, 10);
+                       break;
+               default:
+                       err = EINVAL;
+                       /* fall through: usage() exits with err */
+               case 'h':
+                       usage(argv[0], err);
+               }
+       }
+
+       if (!params)
+               usage(argv[0], EINVAL);
+
+       tapdisk_start_logging("tapdisk-stream", "daemon");
+
+       err = tapdisk_stream_open(&stream, params, count, skip);
+       if (err)
+               goto out;
+
+       err = tapdisk_stream_run(&stream);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       /*
+        * Also reached when tapdisk_stream_open() failed, which has already
+        * closed the stream once itself.
+        */
+       tapdisk_stream_close(&stream);
+       tapdisk_stop_logging();
+       return err;
+}
diff --git a/tools/blktap3/drivers/tapdisk-syslog.c 
b/tools/blktap3/drivers/tapdisk-syslog.c
--- a/tools/blktap3/drivers/tapdisk-syslog.c
+++ b/tools/blktap3/drivers/tapdisk-syslog.c
@@ -231,7 +231,7 @@ static void tapdisk_syslog_ring_warning(
     n = log->oom;
     log->oom = 0;
 
-    err = tapdisk_syslog(log, LOG_WARNING,
+       err = tapdisk_syslog(log, TLOG_WARN,
                          "tapdisk-syslog: %d messages dropped", n);
     if (err)
         log->oom = n;
diff --git a/tools/blktap3/drivers/tapdisk-utils.c 
b/tools/blktap3/drivers/tapdisk-utils.c
--- a/tools/blktap3/drivers/tapdisk-utils.c
+++ b/tools/blktap3/drivers/tapdisk-utils.c
@@ -37,6 +37,8 @@
 #include <sys/ioctl.h>
 #include <sys/resource.h>
 #include <sys/utsname.h>
+#include <arpa/inet.h>
+
 #ifdef __linux__
 #include <linux/version.h>
 #endif
@@ -181,6 +183,7 @@ tapdisk_namedup(char **dup, const char *
        return 0;
 }
 
+/* FIXME Is this still used? */
 /*Get Image size, secsize*/
 int
 tapdisk_get_image_size(int fd, uint64_t *_sectors, uint32_t *_sector_size)
@@ -264,3 +267,19 @@ int tapdisk_linux_version(void)
 }
 
 #endif
+
+/*
+ * Convert a 64-bit value between network (big-endian) and host byte order.
+ * Host endianness is probed at run time through htonl(), so the result does
+ * not depend on the build system defining WORDS_BIGENDIAN.
+ */
+uint64_t ntohll(uint64_t a) {
+       uint32_t lo = (uint32_t)(a & 0xffffffff);
+       uint32_t hi = (uint32_t)(a >> 32U);
+
+       if (htonl(1) == 1)
+               return a;       /* big-endian host: nothing to swap */
+       return ((uint64_t) ntohl(lo)) << 32U | ntohl(hi);
+}
+#define htonll ntohll
+
diff --git a/tools/blktap3/drivers/tapdisk-utils.h 
b/tools/blktap3/drivers/tapdisk-utils.h
--- a/tools/blktap3/drivers/tapdisk-utils.h
+++ b/tools/blktap3/drivers/tapdisk-utils.h
@@ -45,5 +45,7 @@ int tapdisk_namedup(char **, const char 
 int tapdisk_parse_disk_type(const char *, char **, int *);
 int tapdisk_get_image_size(int, uint64_t *, uint32_t *);
 int tapdisk_linux_version(void);
+uint64_t ntohll(uint64_t);
+#define htonll ntohll
 
 #endif
diff --git a/tools/blktap3/drivers/tapdisk-vbd.c 
b/tools/blktap3/drivers/tapdisk-vbd.c
--- a/tools/blktap3/drivers/tapdisk-vbd.c
+++ b/tools/blktap3/drivers/tapdisk-vbd.c
@@ -48,10 +48,14 @@
 #include "tapdisk-stats.h"
 #include "sring/td-stats.h"
 #include "tapdisk-storage.h"
+#include "tapdisk-nbdserver.h"
 
 #define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a)
 #define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
 
+#define INFO(_f, _a...)            tlog_syslog(TLOG_INFO, "vbd: " _f, ##_a)
+#define ERROR(_f, _a...)           tlog_syslog(TLOG_WARN, "vbd: " _f, ##_a)
+
 #if 1
 #define ASSERT(p)                                                      \
        do {                                                            \
@@ -65,7 +69,6 @@
 #define ASSERT(p) ((void)0)
 #endif
 
-
 #define TD_VBD_EIO_RETRIES          10
 #define TD_VBD_EIO_SLEEP            1
 #define TD_VBD_WATCHDOG_TIMEOUT     10
@@ -261,6 +264,14 @@ static int tapdisk_vbd_add_secondary(td_
     const char *path;
        int type, err;
 
+       if (strcmp(vbd->secondary_name, "null") == 0) {
+               DPRINTF("Removing secondary image\n");
+               vbd->secondary_mode = TD_VBD_SECONDARY_DISABLED;
+               vbd->secondary = NULL;
+               vbd->nbd_mirror_failed = 0;
+               return 0;
+       }
+
     DPRINTF("Adding secondary image: %s\n", vbd->secondary_name);
 
     type = tapdisk_disktype_parse_params(vbd->secondary_name, &path);
@@ -274,8 +285,15 @@ static int tapdisk_vbd_add_secondary(td_
     }
 
     err = tapdisk_image_open(type, path, leaf->flags, &second);
-               if (err)
+       if (err) {
+               if (type == DISK_TYPE_NBD)
+                       vbd->nbd_mirror_failed = 1;
+
+               vbd->secondary=NULL;
+               vbd->secondary_mode=TD_VBD_SECONDARY_DISABLED;
+               
         goto fail;
+       }
 
     if (second->info.size != leaf->info.size) {
         EPRINTF("Secondary image size %" PRIu64 " != image size %" PRIu64
@@ -465,8 +483,12 @@ tapdisk_vbd_open_vdi(td_vbd_t * vbd, con
 
     if (td_flag_test(vbd->flags, TD_OPEN_SECONDARY)) {
         err = tapdisk_vbd_add_secondary(vbd);
-        if (err)
+               if (err) {
+                       if (vbd->nbd_mirror_failed != 1)
             goto fail;
+                       INFO("Ignoring failed NBD secondary attach\n");
+                       err = 0;
+               }
     }
 
     if (tmp != vbd->name)
@@ -631,6 +653,12 @@ int tapdisk_vbd_retry_needed(td_vbd_t * 
              TAILQ_EMPTY(&vbd->new_requests));
 }
 
+int
+tapdisk_vbd_lock(td_vbd_t *vbd __attribute__((unused)))
+{
+       return 0;
+}
+
 int tapdisk_vbd_quiesce_queue(td_vbd_t * vbd)
 {
     if (!TAILQ_EMPTY(&vbd->pending_requests)) {
@@ -686,17 +714,20 @@ int tapdisk_vbd_pause(td_vbd_t * vbd)
 {
        int err;
 
-    DBG(TLOG_DBG, "pause requested\n");
+       INFO("pause requested\n");
 
        td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
 
+       if (vbd->nbdserver)
+               tapdisk_nbdserver_pause(vbd->nbdserver);
+
        err = tapdisk_vbd_quiesce_queue(vbd);
        if (err)
                return err;
 
        tapdisk_vbd_close_vdi(vbd);
 
-    DBG(TLOG_DBG, "pause completed\n");
+       INFO("pause completed\n");
 
        td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
        td_flag_set(vbd->state, TD_VBD_PAUSED);
@@ -716,8 +747,8 @@ int tapdisk_vbd_resume(td_vbd_t * vbd, c
        }
 
        for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
-        err =
-            tapdisk_vbd_open_vdi(vbd, name, vbd->flags | TD_OPEN_STRICT, NULL);
+               err = tapdisk_vbd_open_vdi(vbd, name, vbd->flags | 
TD_OPEN_STRICT,
+                NULL);
         if (!err)
                        break;
 
@@ -734,6 +765,9 @@ int tapdisk_vbd_resume(td_vbd_t * vbd, c
        td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
        tapdisk_vbd_check_state(vbd);
 
+       if (vbd->nbdserver)
+               tapdisk_nbdserver_unpause(vbd->nbdserver);
+
     DBG(TLOG_DBG, "state checked\n");
 
        return 0;
@@ -902,10 +936,10 @@ static void
                if (err != -EBUSY) {
             if (!vreq->error && err != vreq->prev_error)
                 tlog_drv_error(image->driver, err,
-                               "req %s: %s 0x%04x secs @ 0x%08" PRIx64,
+                                              "req %s: %s 0x%04x secs @ 
0x%08"PRIx64" - %s",
                                vreq->name,
                            (treq.op == TD_OP_WRITE ? "write" : "read"),
-                           treq.secs, treq.sec);
+                                              treq.secs, treq.sec, 
strerror(abs(err)));
             vbd->errors++;
                }
         vreq->error = (vreq->error ? : err);
@@ -1020,6 +1054,26 @@ void tapdisk_vbd_complete_td_request(td_
         }
     }
 
+       if (res != 0)
+               DPRINTF("Res=%d, image->type=%d\n", res, image->type);
+
+       if (res != 0 && image->type == DISK_TYPE_NBD && 
+                       ((image == vbd->secondary) || 
+                        (image == vbd->retired))) {
+               ERROR("Got non-zero res for NBD secondary - disabling "
+                               "mirroring: %s",vreq->name);
+               vbd->nbd_mirror_failed = 1;
+               res = 0; /* Pretend the writes have completed successfully */
+
+               /* It was the secondary that timed out - disable secondary */
+        TAILQ_REMOVE(&vbd->images, image, entry);
+               vbd->retired = image;
+               if (vbd->secondary_mode != TD_VBD_SECONDARY_DISABLED) {
+                       vbd->secondary = NULL;
+                       vbd->secondary_mode = TD_VBD_SECONDARY_DISABLED;
+               }
+       }
+
     DBG(TLOG_DBG, "%s: req %s seg %d sec 0x%08" PRIx64
            "secs 0x%04x buf %p op %d res %d\n", image->name,
         vreq->name, treq.sidx, treq.sec, treq.secs,
@@ -1297,6 +1351,27 @@ void tapdisk_vbd_kick(td_vbd_t * vbd)
        }
 }
 
+int
+tapdisk_vbd_start_nbdserver(td_vbd_t *vbd)
+{
+       td_disk_info_t info;
+       int err;
+
+       /* the server allocator is handed the VBD's current disk info */
+       err = tapdisk_vbd_get_disk_info(vbd, &info);
+       if (err)
+               return err;
+
+       vbd->nbdserver = tapdisk_nbdserver_alloc(vbd, info);
+       if (vbd->nbdserver)
+               return 0;
+
+       /* allocation failed: vbd->nbdserver is left NULL */
+       EPRINTF("Error starting nbd server");
+
+       return -1;
+}
+
 void tapdisk_vbd_stats(td_vbd_t * vbd, td_stats_t * st)
 {
     td_image_t *image, *next;
@@ -1324,5 +1399,9 @@ void tapdisk_vbd_stats(td_vbd_t * vbd, t
                         "FIXME_enospc_redirect_count",
                         "llu", vbd->FIXME_enospc_redirect_count);
 
+       tapdisk_stats_field(st,
+                       "nbd_mirror_failed",
+                       "d", vbd->nbd_mirror_failed);
+
     tapdisk_stats_leave(st, '}');
 }
diff --git a/tools/blktap3/drivers/tapdisk-vbd.h 
b/tools/blktap3/drivers/tapdisk-vbd.h
--- a/tools/blktap3/drivers/tapdisk-vbd.h
+++ b/tools/blktap3/drivers/tapdisk-vbd.h
@@ -55,6 +55,8 @@
 
 TAILQ_HEAD(tqh_td_vbd_handle, td_vbd_handle);
 
+struct td_nbdserver;
+
 struct td_vbd_handle {
     /**
      * type:/path/to/file
@@ -80,13 +82,16 @@ struct td_vbd_handle {
        int                         FIXME_enospc_redirect_count_enabled;
        uint64_t                    FIXME_enospc_redirect_count;
 
-       /* when we encounter ENOSPC on the primary leaf image in mirror mode, 
+       /** 
+        * when we encounter ENOSPC on the primary leaf image in mirror mode, 
         * we need to remove it from the VBD chain so that writes start going 
         * on the secondary leaf. However, we cannot free the image at that 
         * time since it might still have in-flight treqs referencing it.  
         * Therefore, we move it into 'retired' until shutdown. */
        td_image_t                 *retired;
 
+       int                         nbd_mirror_failed;
+
        struct tqh_td_vbd_request   new_requests;
        struct tqh_td_vbd_request   pending_requests;
        struct tqh_td_vbd_request   failed_requests;
@@ -105,6 +110,8 @@ struct td_vbd_handle {
        uint64_t                    retries;
        uint64_t                    errors;
        td_sector_count_t           secs;
+
+       struct td_nbdserver        *nbdserver;
 };
 
 #define tapdisk_vbd_for_each_request(vreq, tmp, list)                  \
@@ -217,6 +224,7 @@ void tapdisk_vbd_check_state(td_vbd_t *)
 int tapdisk_vbd_recheck_state(td_vbd_t *);
 void tapdisk_vbd_check_progress(td_vbd_t *);
 void tapdisk_vbd_debug(td_vbd_t *);
+int tapdisk_vbd_start_nbdserver(td_vbd_t *);
 void tapdisk_vbd_stats(td_vbd_t *, td_stats_t *);
 
 #endif
diff --git a/tools/blktap3/drivers/tapdisk.c b/tools/blktap3/drivers/tapdisk.c
--- a/tools/blktap3/drivers/tapdisk.c
+++ b/tools/blktap3/drivers/tapdisk.c
@@ -37,6 +37,9 @@
 #include "tapdisk-server.h"
 #include "tapdisk-control.h"
 
+void tdnbd_fdreceiver_start(void);
+void tdnbd_fdreceiver_stop(void);
+
 static void usage(const char *app, int err)
 {
     fprintf(stderr, "usage: %s <-u uuid> <-c control socket>\n", app);
@@ -131,9 +134,17 @@ int main(int argc, char *argv[])
     fprintf(out, "%s\n", control);
     fclose(out);
 
+    /*
+     * NB: We're unconditionally starting the FD receiver here - this is 
+     * for the block-nbd driver. In the future we may want to start this as 
+     * a response to a tap-ctl message
+     */
+    tdnbd_fdreceiver_start();
+
     err = tapdisk_server_run();
 
   out:
+    tdnbd_fdreceiver_stop();
     tapdisk_control_close();
     tapdisk_stop_logging();
     return -err;
diff --git a/tools/blktap3/drivers/tapdisk.h b/tools/blktap3/drivers/tapdisk.h
--- a/tools/blktap3/drivers/tapdisk.h
+++ b/tools/blktap3/drivers/tapdisk.h
@@ -60,11 +60,8 @@
 #include <stdint.h>
 #include <assert.h>
 
-// XXX?
-//#include "blktaplib.h"
 #include "blktap3.h"
 
-// TODO necessary?
 #include "tapdisk-log.h"
 #include "tapdisk-utils.h"
 #include "tapdisk-stats.h"
@@ -208,6 +205,8 @@ struct td_request {
     td_vbd_request_t            *vreq;
 };
 
+struct tqh_td_image_handle;
+
 /* 
  * Structure describing the interface to a virtual disk implementation.
  * See note at the top of this file describing this interface.
@@ -217,7 +216,7 @@ struct tap_disk {
        td_flag_t                    flags;
        int                          private_data_size;
        int (*td_open)               (td_driver_t *, const char *, td_flag_t);
-       int (*td_close)              (td_driver_t *);
+       int (*td_close)              (td_driver_t *, struct tqh_td_image_handle 
*);
        int (*td_get_parent_id)      (td_driver_t *, td_disk_id_t *);
        int (*td_validate_parent)    (td_driver_t *, td_driver_t *, td_flag_t);
        void (*td_queue_read)        (td_driver_t *, td_request_t);
diff --git a/tools/blktap3/drivers/td-rated.1.txt 
b/tools/blktap3/drivers/td-rated.1.txt
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/td-rated.1.txt
@@ -0,0 +1,190 @@
+
+SYNOPSIS
+
+    td-rated <name> -type {token|leaky|meminfo} -- [options]
+
+DESCRIPTION
+
+    The td-rated 'bridge' is a daemon program to which one or a number
+    of tapdisk processes connect, in order to cooperatively limit the
+    data rate at which they will issue I/O requests to physical
+    storage.
+
+    A data rate denotes I/O bandwidth, i.e. an (average) amount of
+    data over time. A rate limiter is a state machine dispatching an
+    overall queue of incoming I/O requests, at a desired data rate.
+
+    The td-rated program includes a number of alternative rate
+    limiting algorithms for various purposes. Rate limiters are
+    discussed below.
+
+    The standard client implementation in tapdisk is a transparent
+    filter driver, of type name 'valve'. Valves are typically inserted
+    at either the top or a certain level of the disk image stack
+    constituting a VDI, thereby uniformly limiting any I/O issued.
+
+    Every bridge process constitutes a single rate limiter. Arbitrary
+    numbers of client valves can connect to each bridge. I/O requests
+    issued by clients are normally aggregated, dividing the available
+    bandwidth among all active clients.
+
+OPTIONS
+
+    Token Bucket
+
+       Token bucket is a rate limiter which drains a request queue of
+       pending I/O requests at a given overall data rate. It is
+       invoked as follows:
+
+       td-rated -t token -- ..
+
+       --rate <limit>
+               Bandwidth limit [B/s].
+
+       --cap <limit>
+               Burst (aggregated credit) limit [B].
+
+       Token bucket's main feature over basic constant-rate
+       algorithms (leaky buckets) is that it allows for I/O
+       bursts. Bursts are batches of data requests, which are
+       preferably issued simultaneously to reduce the overall number
+       of seeks involved on shared rotational media. 
+
+       With bursty I/O transfers, bandwidth may transiently exceed
+       the nominal data rate, but in a controlled fashion. Different
+       from a constant rate output, the I/O output rate is maintained
+       as an average over periods of time.
+
+       Internally, bursts issued at any time instant consume
+       bandwidth credit ('tokens'). Credit gets accumulated, at the
+       given rate, over time. Once exhausted, credit taken must be
+       amortized before additional I/O can pass. That is, while the
+       rate set will limit an output data rate, it does so only
+       indirectly, by limiting the rate at which new credit is
+       assigned.
+
+       The cap argument is a limit to accumulated credit. Excess
+       credit above the given capacity will be discarded. Caps limit
+       the maximum burst size observable. The maximum only becomes
+       available whenever all clients remained idle for a time
+       period of cap/rate.
+
+       While a token bucket allows for bursts, it does not promote or
+       enforce them. Once configured bandwidth credit is exceeded,
+       amortization time is applied to client request batches
+       individually, in the order in which they were issued, and
+       output will effectively degrade to a constant data rate.
+ 
+    Leaky Bucket
+
+       Leaky bucket is a simpler constant rate algorithm. Requests
+       are issued in a round-robin fashion. The given rate is never
+       exceeded, so requests are not issued in bursts.
+
+       This is presently equivalent to a token bucket with a cap
+       value of zero, and therefore implemented accordingly.
+
+       td-rated -t leaky -- ..
+
+        --rate <limit>
+               Bandwidth limit [B/s].
+
+    Meminfo Driver
+
+       Meminfo is an experimental rate limiting driver aiming
+       specifically at write bandwidth reduction for tapdisk I/O
+       modes targeting the host OS buffer cache. It is invoked as
+       follows:
+
+       td-rated -t meminfo -- ..
+       
+       --high <limit>
+               [% of total memory]
+
+       --low <limit>
+               [% of total memory]
+
+       [--period <time>]
+               Memory stats update period [ms]
+               Default: 100
+
+       -t <type> ...
+               Subordinate rate limiter type.
+
+       -- [ subordinate options .. ]
+
+       Where the subordinate type and options typically invokes one
+       of the basic rate-oriented algorithms described above.
+
+       Memory limits are not bandwidth limits, but cache utilization
+       bounds aimed to be met. The arguments to --high and --low
+       options are watermarks setting hysteresis limits on domain OS
+       cache utilization detected. They are defined in percent of
+       total memory available to the domain OS.
+       
+       The driver periodically scans OS memory statistics to estimate
+       present host buffer I/O loads. By default a state update is
+       performed every 100ms.
+
+       The cache is considered underutilized while the amount of
+       memory either modified, or under writeback, does not exceed
+       the percentage indicated by --high. In that state, I/O will
+       pass unrestricted.
+
+       Once the --high limit is exceeded, a congestion mode of
+       operation is entered, where the output data rate is
+       reduced. That state prevails until the cache is detected
+       underutilized again, at a value below or equal to the --low
+       watermark.
+
+       Meminfo rate limiting is driven by overall domain state,
+       commonly involving applications not sharing the same domain of
+       bandwidth arbitration. I/O can therefore only be throttled,
+       not blocked, or would risk starvation. For that purpose, the
+       meminfo driver requires a (configurable) subordinate rate
+       limiter. This may be any of the raw bandwidth-oriented
+       implementations available.
+
+    Limit Formats
+
+        I/O size and limit values specified at td-rated invocation
+        time are integers in units of bytes, or integers as multiples
+        of units given in either SI decimal (K,M,G) or IEC binary
+        (Ki,Mi,Gi) suffix notation, e.g. 10k (10 * 10^3 B), 128Mi (128
+        * 2^20 B), 1Gi (1 * 2^30 B).
+
+EXAMPLES
+
+    Invocations
+ 
+       td-rated /var/run/blktap/x.sk -t leaky -- \
+               --rate=60M
+
+         Constant-rate output rate limit at 60M/s. Listening for
+         client connections at /var/run/blktap/x.sk.
+
+       td-rated /var/run/blktap/y.sk -t token -- \
+               --rate=80M --cap 10M
+
+         Token bucket rate limiting at 80M/s with a burst limit of 10M.
+       
+       td-rated /var/run/blktap/y.sk -t meminfo -- \
+               --low=40 --high=60 -t leaky -- --rate=15M
+
+         Buffer I/O rate limiting with a high/low cache utilization
+         watermark of 60%/40% of host memory. Once the upper limit is
+         met, constant rate output targeting a limit of 15M/s is
+         applied.
+
+    Image Chain
+
+       tap-ctl create x-chain:/var/tmp/limit.chain
+
+       /var/tmp/limit.chain:
+               valve:/var/run/blktap/x.sk
+               vhd:/dev/vg/image.vhd
+
+BUGS
+
+    The -t leaky type isn't really aliased yet properly.
+    Use the form -t token -- --cap=0 instead.
diff --git a/tools/blktap3/drivers/td-rated.c b/tools/blktap3/drivers/td-rated.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/td-rated.c
@@ -0,0 +1,1722 @@
+/*
+ * Copyright (c) 2011, Citrix Systems.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <getopt.h>
+#include <syslog.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/time.h>
+
+#include "block-valve.h"
+#include "compiler.h"
+#include "list.h"
+
+/*
+ * Log sink writing to stderr: the formatted message followed by a
+ * newline. The priority argument is not used by this sink.
+ */
+static void
+rlb_vlog_vfprintf(int prio, const char *fmt, va_list ap)
+{
+       vfprintf(stderr, fmt, ap);
+       fputc('\n', stderr);
+}
+
+static void (*rlb_vlog)(int prio, const char *fmt, va_list ap);
+
+/*
+ * printf-style front end: forwards the message and its priority to
+ * whichever sink the rlb_vlog function pointer currently refers to.
+ */
+__printf(2, 3)
+static void
+rlb_log(int prio, const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       rlb_vlog(prio, fmt, args);
+       va_end(args);
+}
+
+static int debug = 0;
+
+#define DBG(_l, _f, _a...) if (debug >= _l) { rlb_log(LOG_DEBUG, _f, ##_a); }
+#define INFO(_f, _a...)    rlb_log(LOG_INFO, _f, ##_a)
+#define WARN(_f, _a...)    rlb_log(LOG_WARNING, "WARNING: " _f ", in %s:%d", \
+                                  ##_a, __func__, __LINE__)
+#define ERR(_f, _a...)     rlb_log(LOG_ERR, "ERROR: " _f ", in %s:%d", \
+                                  ##_a, __func__, __LINE__)
+#define PERROR(_f, _a...)  rlb_log(LOG_ERR, _f ": %s in %s:%d", \
+                                  ##_a, strerror(errno), __func__, __LINE__)
+
+#define BUG() do {                                             \
+               ERR("Aborting");                                \
+               abort();                                        \
+       } while (0)
+
+#define BUG_ON(_cond)                                          \
+       if (unlikely(_cond)) {                                  \
+               ERR("(%s) = %d", #_cond, _cond);                \
+               BUG();                                          \
+       }
+
+#define WARN_ON(_cond) ({                                      \
+       int __cond = _cond;                                     \
+       if (unlikely(__cond))                                   \
+               WARN("(%s) = %d", #_cond, _cond);               \
+       __cond;                                         \
+})
+
+#define MAX(a, b)       ((a) > (b) ? (a) : (b))
+#define MIN(a, b)       ((a) < (b) ? (a) : (b))
+
+#define ARRAY_SIZE(_a)  (sizeof(_a)/sizeof((_a)[0]))
+
+typedef struct ratelimit_bridge        td_rlb_t;
+typedef struct ratelimit_connection    td_rlb_conn_t;
+
+struct ratelimit_connection {
+       int                            sock; /* client socket as returned by accept() */
+
+       unsigned long                  need; /* I/O requested, not yet granted */
+       unsigned long                  gntd; /* I/O granted, not yet reported done */
+
+       struct list_head               open; /* link in the bridge's list of connections */
+       struct list_head               wait; /* link in the bridge's wait list while need > 0 */
+
+       struct {
+               struct timeval         since; /* bridge time when the wait list was joined */
+               struct timeval         total; /* accumulated time spent waiting */
+       } wstat;
+};
+
+#define RLB_CONN_MAX                   1024
+
+struct ratelimit_ops { /* callbacks of one rate limiter type; 'data' is the valve's private state */
+       void    (*usage)(td_rlb_t *rlb, FILE *stream, void *data);
+
+       int     (*create)(td_rlb_t *rlb, int argc, char **argv, void **data);
+       void    (*destroy)(td_rlb_t *rlb, void *data);
+
+       void    (*info)(td_rlb_t *rlb, void *data);
+       /* settimeo: store the next timeout in *tv, or NULL for none (cf. rlb_token_settimeo) */
+       void    (*settimeo)(td_rlb_t *rlb, struct timeval **tv, void *data);
+       void    (*timeout)(td_rlb_t *rlb, void *data);
+       void    (*dispatch)(td_rlb_t *rlb, void *data);
+       void    (*reset)(td_rlb_t *rlb, void *data);
+};
+
+struct ratelimit_bridge {
+       char                          *name; /* socket name: absolute path, or relative to TD_VALVE_SOCKDIR */
+       char                          *ident;
+
+       struct sockaddr_un             addr; /* bind address built from name */
+       char                          *path; /* addr.sun_path once bound; unlinked on close */
+       int                            sock; /* listening socket, -1 when closed */
+
+       struct list_head               open; /* all connections */
+       struct list_head               wait; /* all in need */
+
+       struct timeval                 ts, now; /* now: reference time for the wait statistics */
+
+       td_rlb_conn_t                  connv[RLB_CONN_MAX]; /* connection slots; a conn's id is its index here */
+       td_rlb_conn_t                 *free[RLB_CONN_MAX]; /* stack of unused slots */
+       int                            n_free; /* number of entries on the free[] stack */
+
+       struct rlb_valve {
+               struct ratelimit_ops  *ops;
+               void                  *data; /* presumably the 'data' handed to the ops callbacks -- confirm */
+       } valve;
+};
+
+#define rlb_for_each_conn(_conn, _rlb)                                 \
+       list_for_each_entry(_conn, &(_rlb)->open, open)
+
+#define rlb_for_each_conn_safe(_conn, _next, _rlb)                     \
+       list_for_each_entry_safe(_conn, _next, &(_rlb)->open, open)
+
+/*
+ * Iterate over all connections on the wait list. _next is accepted for
+ * signature compatibility with rlb_for_each_waiting_safe() but is not
+ * used: list_for_each_entry() takes (pos, head, member) only.
+ */
+#define rlb_for_each_waiting(_conn, _next, _rlb)                       \
+       list_for_each_entry(_conn, &(_rlb)->wait, wait)
+
+#define rlb_for_each_waiting_safe(_conn, _next, _rlb)                  \
+       list_for_each_entry_safe(_conn, _next, &(_rlb)->wait, wait)
+
+#define rlb_conn_entry(_list)                  \
+       list_entry(_list, td_rlb_conn_t, open)
+
+#define rlb_wait_entry(_list)                  \
+       list_entry(_list, td_rlb_conn_t, wait)
+
+static struct ratelimit_ops *rlb_find_valve(const char *name);
+
+static int rlb_create_valve(td_rlb_t *, struct rlb_valve *,
+                           const char *name, int argc, char **argv);
+
+/*
+ * util
+ */
+
+#define case_G case 'G': case 'g'
+#define case_M case 'M': case 'm'
+#define case_K case 'K': case 'k'
+
+/*
+ * Parse a byte count with an optional unit suffix.
+ *
+ * Accepts an integer in any base strtoul() understands, optionally
+ * followed by K, M or G (decimal multiples of 1000) or by Ki, Mi or Gi
+ * (binary multiples of 1024). The K/M/G letter matches in either case.
+ *
+ * Returns the value in bytes, or -EINVAL if the string holds no
+ * digits, the suffix is unknown or followed by extra characters, or
+ * the result does not fit in a long.
+ */
+static long
+rlb_strtol(const char *s)
+{
+       const unsigned long max = ~0UL >> 1;    /* largest positive long */
+       unsigned long l, u = 1, base;
+       char *end, p;
+
+       l = strtoul(s, &end, 0);
+       if (end == s)
+               goto fail;      /* no digits consumed */
+
+       p = *end;
+       if (p) {
+               end++;
+
+               /* an 'i' after the unit letter selects binary multiples */
+               if (*end == 'i') {
+                       base = 1024;
+                       end++;
+               } else
+                       base = 1000;
+
+               if (*end)
+                       goto fail;      /* trailing characters */
+
+               switch (p) {
+               case_G:
+                       u *= base;
+                       /* fallthrough */
+               case_M:
+                       u *= base;
+                       /* fallthrough */
+               case_K:
+                       u *= base;
+                       break;
+
+               default:
+                       goto fail;
+               }
+       }
+
+       /* refuse values that would wrap or turn negative as a long */
+       if (l > max / u)
+               goto fail;
+
+       return l * u;
+
+fail:
+       return -EINVAL;
+}
+
+/*
+ * vasprintf() wrapper: returns a newly allocated formatted string, or
+ * NULL when formatting fails. The caller frees the result.
+ */
+static char*
+vmprintf(const char *fmt, va_list ap)
+{
+       char *str;
+
+       if (vasprintf(&str, fmt, ap) < 0)
+               return NULL;
+
+       return str;
+}
+
+/* Variadic front end to vmprintf(). */
+__printf(1, 2)
+static char*
+mprintf(const char *fmt, ...)
+{
+       va_list args;
+       char *str;
+
+       va_start(args, fmt);
+       str = vmprintf(fmt, args);
+       va_end(args);
+
+       return str;
+}
+
+/*
+ * Scan /proc/sys/<name> with the given scanf format.
+ *
+ * Returns the vfscanf() result, or -errno when the path cannot be
+ * built or the file cannot be opened.
+ */
+static int
+sysctl_vscanf(const char *name, const char *fmt, va_list ap)
+{
+       char *path;
+       FILE *fp;
+       int rv;
+
+       path = mprintf("/proc/sys/%s", name);
+       if (!path)
+               return -errno;
+
+       fp = fopen(path, "r");
+       if (fp) {
+               rv = vfscanf(fp, fmt, ap);
+               fclose(fp);
+       } else
+               rv = -errno;
+
+       free(path);
+
+       return rv;
+}
+
+/* Variadic front end to sysctl_vscanf(); returns its result unchanged. */
+static int
+sysctl_scanf(const char *name, const char *fmt, ...)
+{
+       va_list args;
+       int rv;
+
+       va_start(args, fmt);
+       rv = sysctl_vscanf(name, fmt, args);
+       va_end(args);
+
+       return rv;
+}
+
+/*
+ * Read a single unsigned integer from /proc/sys/<name>.
+ *
+ * Returns the value, a negative result passed through from
+ * sysctl_scanf(), or -EINVAL when no number could be converted.
+ */
+static long
+sysctl_strtoul(const char *name)
+{
+       /* must be unsigned long: "%lu" stores through an unsigned long * */
+       unsigned long val;
+       int n;
+
+       n = sysctl_scanf(name, "%lu", &val);
+       if (n < 0)
+               return n;
+       if (n != 1)
+               return -EINVAL;
+
+       return val;
+}
+
+
+/* Convert a timeval to a single microsecond count. */
+static long long
+rlb_tv_usec(const struct timeval *tv)
+{
+       long long secs = tv->tv_sec;
+
+       return secs * 1000000 + tv->tv_usec;
+}
+
+/* Microseconds between 'since' and the bridge's current time (rlb->now). */
+static long long
+rlb_usec_since(td_rlb_t *rlb, const struct timeval *since)
+{
+       struct timeval elapsed;
+
+       timersub(&rlb->now, since, &elapsed);
+       return rlb_tv_usec(&elapsed);
+}
+
+/*
+ * Re-base an argument vector after a '--' separator: drop the first
+ * (*optind - 1) entries from argc/argv and reset *optind to 1.
+ */
+static inline void
+rlb_argv_shift(int *optind, int *argc, char ***argv)
+{
+       int skip = *optind - 1;
+
+       *argc  -= skip;
+       *argv  += skip;
+       *optind = 1;
+}
+
+/*
+ * socket I/O
+ */
+
+/*
+ * Tear down the listening socket: unlink the bound path (if any) and
+ * close the descriptor (if open), resetting both fields.
+ */
+static void
+rlb_sock_close(td_rlb_t *rlb)
+{
+       const char *bound = rlb->path;
+       int fd = rlb->sock;
+
+       if (bound) {
+               unlink(bound);
+               rlb->path = NULL;
+       }
+
+       if (fd < 0)
+               return;
+
+       close(fd);
+       rlb->sock = -1;
+}
+
+/*
+ * Create, bind and listen on the bridge's UNIX stream socket.
+ *
+ * The socket path is rlb->name when that is absolute, otherwise
+ * TD_VALVE_SOCKDIR/<name>. A path that does not fit sun_path is
+ * rejected with -ENAMETOOLONG rather than silently truncated or left
+ * without a terminating NUL.
+ *
+ * Returns 0 on success or a negative errno value. On failure anything
+ * set up so far is released through rlb_sock_close().
+ */
+static int
+rlb_sock_open(td_rlb_t *rlb)
+{
+       int s, n, err;
+
+       rlb->sock = -1;
+
+       s = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (s < 0) {
+               /* capture errno before logging can disturb it */
+               err = -errno;
+               PERROR("socket");
+               goto fail;
+       }
+
+       rlb->sock = s;
+
+       rlb->addr.sun_family = AF_UNIX;
+
+       if (rlb->name[0] == '/')
+               n = snprintf(rlb->addr.sun_path, sizeof(rlb->addr.sun_path),
+                            "%s", rlb->name);
+       else
+               n = snprintf(rlb->addr.sun_path, sizeof(rlb->addr.sun_path),
+                            "%s/%s", TD_VALVE_SOCKDIR, rlb->name);
+
+       if (n < 0 || (size_t)n >= sizeof(rlb->addr.sun_path)) {
+               err = -ENAMETOOLONG;
+               ERR("socket path too long: %s", rlb->name);
+               goto fail;
+       }
+
+       err = bind(rlb->sock, (const struct sockaddr *)&rlb->addr,
+                  sizeof(rlb->addr));
+       if (err) {
+               err = -errno;
+               PERROR("%s", rlb->addr.sun_path);
+               goto fail;
+       }
+
+       /* from here on rlb_sock_close() will unlink the socket file */
+       rlb->path = rlb->addr.sun_path;
+
+       err = listen(rlb->sock, RLB_CONN_MAX);
+       if (err) {
+               err = -errno;
+               PERROR("listen(%s)", rlb->addr.sun_path);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       rlb_sock_close(rlb);
+       return err;
+}
+
+/*
+ * Non-blocking send of one message on a client connection.
+ *
+ * Returns 0 on success, -errno if send() fails, or -EPROTO when a
+ * non-zero part of the message, but not all of it, was written.
+ */
+static int
+rlb_sock_send(td_rlb_t *rlb, td_rlb_conn_t *conn,
+             const void *msg, size_t size)
+{
+       ssize_t sent = send(conn->sock, msg, size, MSG_DONTWAIT);
+
+       if (sent < 0)
+               return -errno;
+
+       /* a zero-byte result is not treated as an error */
+       if (sent != 0 && sent != size)
+               return -EPROTO;
+
+       return 0;
+}
+
+/*
+ * Non-blocking receive on a client connection.
+ *
+ * Returns the number of bytes read (0 when recv() reports none), or
+ * -errno on failure.
+ */
+static int
+rlb_sock_recv(td_rlb_t *rlb, td_rlb_conn_t *conn,
+             void *msg, size_t size)
+{
+       ssize_t got = recv(conn->sock, msg, size, MSG_DONTWAIT);
+
+       return got < 0 ? -errno : got;
+}
+
+/* Pop an unused connection slot off the free stack; NULL if none is left. */
+static td_rlb_conn_t *
+rlb_conn_alloc(td_rlb_t *rlb)
+{
+       if (unlikely(rlb->n_free <= 0))
+               return NULL;
+
+       rlb->n_free--;
+
+       return rlb->free[rlb->n_free];
+}
+
+/* Push a connection slot back onto the free stack; the stack must not be full. */
+static void
+rlb_conn_free(td_rlb_t *rlb, td_rlb_conn_t *conn)
+{
+       BUG_ON(rlb->n_free >= RLB_CONN_MAX);
+
+       rlb->free[rlb->n_free] = conn;
+       rlb->n_free++;
+}
+
+/* A connection's id is its index within the bridge's connv[] array. */
+static int
+rlb_conn_id(td_rlb_t *rlb, td_rlb_conn_t *conn)
+{
+       td_rlb_conn_t *first = rlb->connv;
+
+       return conn - first;
+}
+
+/*
+ * Log one connection's state: outstanding need, how long it has been
+ * waiting (ms), total time spent waiting, and the amount granted.
+ * Warns if "has a need" and "is on the wait list" disagree.
+ */
+static void
+rlb_conn_info(td_rlb_t *rlb, td_rlb_conn_t *conn)
+{
+       long long wtime;
+       int waits;
+
+       wtime = 0;
+       waits = !list_empty(&conn->wait);
+       if (waits)
+               wtime = rlb_usec_since(rlb, &conn->wstat.since) / 1000;
+
+       WARN_ON(!!conn->need != waits);
+
+       /*
+        * Cast explicitly: wtime is signed, and the widths of time_t and
+        * suseconds_t are platform-specific, so none of them is
+        * guaranteed to match %llu / %lu as passed.
+        */
+       INFO("conn[%d] needs %lu (since %llu ms, total %lu.%06lu s),"
+            " %lu granted",
+            rlb_conn_id(rlb, conn), conn->need,
+            (unsigned long long)wtime,
+            (unsigned long)conn->wstat.total.tv_sec,
+            (unsigned long)conn->wstat.total.tv_usec,
+            conn->gntd);
+}
+
+/* Log the state of every open connection. */
+static void
+rlb_conn_infos(td_rlb_t *rlb)
+{
+       td_rlb_conn_t *cur;
+
+       rlb_for_each_conn(cur, rlb) {
+               rlb_conn_info(rlb, cur);
+       }
+}
+
+/*
+ * Close a client connection: log its final state, close the socket,
+ * unlink it from the wait and open lists and return the slot to the
+ * free stack.
+ */
+static void
+rlb_conn_close(td_rlb_t *rlb, td_rlb_conn_t *conn)
+{
+       int s = conn->sock;
+
+       INFO("Connection %d closed.", rlb_conn_id(rlb, conn));
+       rlb_conn_info(rlb, conn);
+
+       /* descriptor 0 is valid; only negative values mean "not open" */
+       if (s >= 0) {
+               close(s);
+               conn->sock = -1;
+       }
+
+       list_del_init(&conn->wait);
+       list_del(&conn->open);
+
+       rlb_conn_free(rlb, conn);
+}
+
+/*
+ * Read and account a batch of valve requests from a client.
+ *
+ * Each request adds req.need to the connection's outstanding need and
+ * subtracts req.done from the amount granted. A connection that ends
+ * up with a need joins the wait list. A zero-length read closes the
+ * connection; a protocol violation or receive error logs a warning
+ * and closes it as well. A receive that merely found no data
+ * (EAGAIN / EWOULDBLOCK / EINTR) leaves the connection untouched.
+ */
+static void
+rlb_conn_receive(td_rlb_t *rlb, td_rlb_conn_t *conn)
+{
+       struct td_valve_req buf[32], req = { -1, -1 };
+       size_t i, count;
+       ssize_t n;
+       int err;
+
+       n = rlb_sock_recv(rlb, conn, buf, sizeof(buf));
+       if (!n)
+               goto close;
+
+       if (n < 0) {
+               err = n;
+               /*
+                * Nothing to read right now is not an error. Without this
+                * return the negative count would reach the unsigned
+                * modulo below and be reported as -EPROTO.
+                */
+               if (err == -EAGAIN || err == -EWOULDBLOCK || err == -EINTR)
+                       return;
+               goto fail;
+       }
+
+       if (unlikely((size_t)n % sizeof(req))) {
+               err = -EPROTO;
+               goto fail;
+       }
+
+       count = (size_t)n / sizeof(buf[0]);
+
+       for (i = 0; i < count; i++) {
+               req = buf[i];
+
+               if (unlikely(req.need > TD_RLB_REQUEST_MAX)) {
+                       err = -EINVAL;
+                       goto fail;
+               }
+
+               /* a client cannot complete more than it was granted */
+               if (unlikely(req.done > conn->gntd)) {
+                       err = -EINVAL;
+                       goto fail;
+               }
+
+               conn->need += req.need;
+               conn->gntd -= req.done;
+
+               DBG(8, "rcv: %lu/%lu need=%lu gntd=%lu",
+                   req.need, req.done, conn->need, conn->gntd);
+
+               if (unlikely(conn->need > TD_RLB_REQUEST_MAX)) {
+                       err = -EINVAL;
+                       goto fail;
+               }
+       }
+
+       if (conn->need && list_empty(&conn->wait)) {
+               list_add_tail(&conn->wait, &rlb->wait);
+               conn->wstat.since = rlb->now;
+       }
+
+       return;
+
+fail:
+       WARN("err = %d (%s)"
+            " (need %ld/%ld, %ld/%ld done),"
+            " closing connection.",
+            err, strerror(-err),
+            req.need, conn->need, req.done, conn->gntd);
+
+       rlb_conn_info(rlb, conn);
+close:
+       rlb_conn_close(rlb, conn);
+}
+
+/*
+ * Grant 'need' units to a client: send the amount, move it from the
+ * connection's need to its granted count and, once nothing is left to
+ * grant, add the elapsed wait to the total and leave the wait list.
+ * A failed send logs a warning and closes the connection.
+ */
+static void
+rlb_conn_respond(td_rlb_t *rlb, td_rlb_conn_t *conn, unsigned long need)
+{
+       struct timeval waited;
+       int err;
+
+       BUG_ON(need > conn->need);
+
+       err = rlb_sock_send(rlb, conn, &need, sizeof(need));
+       if (err) {
+               WARN("err = %d, killing connection.", err);
+               rlb_conn_close(rlb, conn);
+               return;
+       }
+
+       conn->need -= need;
+       conn->gntd += need;
+
+       DBG(8, "snd: %lu need=%lu gntd=%lu", need, conn->need, conn->gntd);
+
+       if (conn->need)
+               return;
+
+       /* fully served: account the wait and leave the wait list */
+       timersub(&rlb->now, &conn->wstat.since, &waited);
+       timeradd(&conn->wstat.total, &waited, &conn->wstat.total);
+
+       list_del_init(&conn->wait);
+}
+
+/*
+ * Accept one pending connection on the listening socket and register it
+ * on the list of open connections.
+ *
+ * Failures (accept() error, or no free connection slot) are logged and
+ * otherwise ignored; in the latter case the new socket is closed again.
+ */
+static void
+rlb_accept_conn(td_rlb_t *rlb)
+{
+       td_rlb_conn_t *conn;
+       int s, err;
+
+       s = accept(rlb->sock, NULL, NULL);
+       /* accept() reports failure with -1; 0 is a valid descriptor */
+       if (s < 0) {
+               err = -errno;
+               goto fail;
+       }
+
+       conn = rlb_conn_alloc(rlb);
+       if (!conn) {
+               err = -ENOMEM;
+               close(s);
+               goto fail;
+       }
+
+       INFO("Accepting connection %td.", conn - rlb->connv);
+
+       memset(conn, 0, sizeof(*conn));
+       INIT_LIST_HEAD(&conn->wait);
+       conn->sock = s;
+       list_add_tail(&conn->open, &rlb->open);
+
+       return;
+
+fail:
+       WARN("err = %d", err);
+}
+
+/*
+ * Sum of the amounts currently granted (conn->gntd) over all open
+ * connections.
+ */
+static long long
+rlb_pending(td_rlb_t *rlb)
+{
+       long long total = 0;
+       td_rlb_conn_t *c;
+
+       rlb_for_each_conn(c, rlb) {
+               total += c->gntd;
+       }
+
+       return total;
+}
+
+/*
+ * token bucket valve
+ */
+
+/* Private state of the token bucket valve. */
+typedef struct ratelimit_token td_rlb_token_t;
+
+struct ratelimit_token {
+       /* current credit; dispatch may drive it below zero */
+       long                      cred;
+       /* upper bound for cred; refill never exceeds it (--cap) */
+       long                      cap;
+       /* credit gained per second of elapsed time (--rate) */
+       long                      rate;
+       /* storage for the timeout handed out by rlb_token_settimeo() */
+       struct timeval            timeo;
+};
+
+/*
+ * Compute how long the main loop may sleep before the bucket needs to be
+ * serviced again.
+ *
+ * With nobody waiting there is no timeout (*_tv = NULL). Otherwise the
+ * timeout is the time needed, at the configured rate, to pay back the
+ * current (expected negative) credit.
+ */
+static void
+rlb_token_settimeo(td_rlb_t *rlb, struct timeval **_tv, void *data)
+{
+       td_rlb_token_t *tok = data;
+       long long usecs;
+
+       if (list_empty(&rlb->wait)) {
+               *_tv = NULL;
+               return;
+       }
+
+       /* connections only keep waiting while the bucket is overdrawn */
+       WARN_ON(tok->cred >= 0);
+
+       usecs = -tok->cred;
+       usecs = usecs * 1000000 / tok->rate;
+
+       tok->timeo.tv_sec  = usecs / 1000000;
+       tok->timeo.tv_usec = usecs % 1000000;
+
+       WARN_ON(!timerisset(&tok->timeo));
+
+       *_tv = &tok->timeo;
+}
+
+/*
+ * Add the credit earned between rlb->ts and rlb->now to the bucket,
+ * never exceeding the bucket's cap.
+ */
+static void
+rlb_token_refill(td_rlb_t *rlb, td_rlb_token_t *token)
+{
+       struct timeval elapsed;
+       long long gained, refill_usec;
+
+       /*
+        * Longest interval worth accounting for: the time (rounded up)
+        * it takes to refill from the current credit to the cap.
+        */
+       refill_usec = token->cap - token->cred;
+       refill_usec = (refill_usec * 1000000 + (token->rate - 1)) /
+               token->rate;
+
+       /* credit earned during the elapsed interval */
+       timersub(&rlb->now, &rlb->ts, &elapsed);
+
+       gained = rlb_tv_usec(&elapsed);
+       gained = MIN(gained, refill_usec);
+       gained = gained * token->rate / 1000000;
+
+       /* top up, clamped to the cap */
+       token->cred += gained;
+       if (token->cred > token->cap)
+               token->cred = token->cap;
+}
+
+/*
+ * Refill the bucket, then serve waiting connections in list order.
+ *
+ * Each served connection gets its full outstanding need, which is charged
+ * to the credit; serving stops as soon as the credit is negative.
+ */
+static void
+rlb_token_dispatch(td_rlb_t *rlb, void *data)
+{
+       td_rlb_token_t *tok = data;
+       td_rlb_conn_t *c, *tmp;
+
+       rlb_token_refill(rlb, tok);
+
+       rlb_for_each_waiting_safe(c, tmp, rlb) {
+               if (tok->cred < 0)
+                       break;
+
+               /* charge first: responding changes c->need */
+               tok->cred -= c->need;
+               rlb_conn_respond(rlb, c, c->need);
+       }
+}
+
+/* Refill the bucket completely: credit is set to the cap. */
+static void
+rlb_token_reset(td_rlb_t *rlb, void *data)
+{
+       td_rlb_token_t *bucket = data;
+
+       bucket->cred = bucket->cap;
+}
+
+/* Release the token bucket state; a NULL @data is accepted. */
+static void
+rlb_token_destroy(td_rlb_t *rlb, void *data)
+{
+       /* free(NULL) is a no-op, so no explicit guard is needed */
+       free(data);
+}
+
+/*
+ * Create a token bucket valve from command line arguments.
+ *
+ * Recognized options: -r/--rate (required, non-zero) and -c/--cap; both
+ * are parsed with rlb_strtol() and must not be negative. The bucket
+ * starts out full.
+ *
+ * Returns 0 and stores the new state in *data on success, -ENOMEM if the
+ * allocation fails, or -EINVAL on invalid arguments (*data is left
+ * untouched on failure).
+ */
+static int
+rlb_token_create(td_rlb_t *rlb, int argc, char **argv, void **data)
+{
+       const struct option longopts[] = {
+               { "rate",        1, NULL, 'r' },
+               { "cap",         1, NULL, 'c' },
+               { NULL,          0, NULL,  0  }
+       };
+       td_rlb_token_t *tok;
+       int opt;
+
+       /* calloc leaves rate and cap at zero until options set them */
+       tok = calloc(1, sizeof(*tok));
+       if (!tok)
+               return -ENOMEM;
+
+       while ((opt = getopt_long(argc, argv, "r:c:", longopts, NULL)) >= 0) {
+               switch (opt) {
+               case 'r':
+                       tok->rate = rlb_strtol(optarg);
+                       if (tok->rate < 0) {
+                               ERR("invalid --rate");
+                               goto usage;
+                       }
+                       break;
+
+               case 'c':
+                       tok->cap = rlb_strtol(optarg);
+                       if (tok->cap < 0) {
+                               ERR("invalid --cap");
+                               goto usage;
+                       }
+                       break;
+
+               case '?':
+                       goto usage;
+
+               default:
+                       BUG();
+               }
+       }
+
+       if (!tok->rate) {
+               ERR("--rate required");
+               goto usage;
+       }
+
+       rlb_token_reset(rlb, tok);
+
+       *data = tok;
+
+       return 0;
+
+usage:
+       free(tok);
+       return -EINVAL;
+}
+
+/* Print the token bucket valve's command line synopsis to @stream. */
+static void
+rlb_token_usage(td_rlb_t *rlb, FILE *stream, void *data)
+{
+       static const char synopsis[] =
+               " {-t|--type}=token --"
+               " {-r|--rate}=<rate [KMG]>"
+               " {-c|--cap}=<size [KMG]>";
+
+       fprintf(stream, "%s", synopsis);
+}
+
+/* Log the bucket's configuration (rate, cap) and current credit. */
+static void
+rlb_token_info(td_rlb_t *rlb, void *data)
+{
+       const td_rlb_token_t *bucket = data;
+
+       INFO("TOKEN: rate: %ld B/s cap: %ld B cred: %ld B",
+            bucket->rate, bucket->cap, bucket->cred);
+}
+
+/*
+ * Operations table of the token bucket valve. Expiry of the timeout and
+ * regular dispatch share one handler: both refill the bucket and then
+ * serve whoever is waiting.
+ */
+static struct ratelimit_ops rlb_token_ops = {
+       .usage    = rlb_token_usage,
+       .create   = rlb_token_create,
+       .destroy  = rlb_token_destroy,
+       .info     = rlb_token_info,
+
+       .settimeo = rlb_token_settimeo,
+       .timeout  = rlb_token_dispatch,
+       .dispatch = rlb_token_dispatch,
+       .reset    = rlb_token_reset,
+};
+
+/*
+ * meminfo valve
+ */
+
+/* Private state of the meminfo valve. */
+typedef struct ratelimit_meminfo td_rlb_meminfo_t;
+
+struct ratelimit_meminfo {
+       /* minimum interval between /proc/meminfo scans, in ms (--period) */
+       unsigned int                   period;
+       /* time of the last scan attempt made from dispatch */
+       struct timeval                 ts;
+
+       /* /proc/meminfo stream; only open for the duration of a scan */
+       FILE                          *s;
+
+       /* MemTotal, Dirty and Writeback as last scanned, in kB */
+       unsigned long                  total;
+       unsigned long                  dirty;
+       unsigned long                  writeback;
+
+       /* high/low watermarks in percent of total (--high/--low) */
+       unsigned int                   limit_hi;
+       unsigned int                   limit_lo;
+       /* non-zero while requests are routed through the sub-valve */
+       unsigned int                   congested;
+
+       /* sub-valve used while congested */
+       struct rlb_valve               valve;
+       /* NOTE(review): not referenced by the meminfo code visible here */
+       struct timeval                 timeo;
+};
+
+/*
+ * Log the meminfo valve's configuration and last scanned memory figures,
+ * followed by the sub-valve's own info.
+ */
+static void
+rlb_meminfo_info(td_rlb_t *rlb, void *data)
+{
+       td_rlb_meminfo_t *mi = data;
+       struct rlb_valve *sub = &mi->valve;
+
+       INFO("MEMINFO: lo/hi: %u/%u%% period: %u ms",
+            mi->limit_lo, mi->limit_hi, mi->period);
+
+       INFO("MEMINFO: total %lu kB, dirty/writeback %lu/%lu kB",
+            mi->total, mi->dirty, mi->writeback);
+
+       sub->ops->info(rlb, sub->data);
+}
+
+/* Close the /proc/meminfo stream if it is open; safe to call repeatedly. */
+static void
+rlb_meminfo_close(td_rlb_meminfo_t *m)
+{
+       if (!m->s)
+               return;
+
+       fclose(m->s);
+       m->s = NULL;
+}
+
+/*
+ * Open /proc/meminfo for reading and store the stream in m->s.
+ *
+ * Returns 0 on success or a negative errno value, in which case m->s
+ * is NULL.
+ */
+static int
+rlb_meminfo_open(td_rlb_meminfo_t *m)
+{
+       m->s = fopen("/proc/meminfo", "r");
+       if (!m->s)
+               return -errno;
+
+       return 0;
+}
+
+/* Return 1 if bit @n of *@bitmap is set, 0 otherwise. */
+static inline int __test_bit(int n, unsigned long *bitmap)
+{
+       const unsigned long mask = 1UL << n;
+
+       return (*bitmap & mask) != 0;
+}
+
+/* Clear bit @n of *@bitmap, leaving all other bits untouched. */
+static inline void __clear_bit(int n, unsigned long *bitmap)
+{
+       const unsigned long mask = 1UL << n;
+
+       *bitmap &= ~mask;
+}
+
+/*
+ * /proc/meminfo keys consumed by rlb_meminfo_scan(): for each key, the
+ * sscanf format matching its line and the offset of the unsigned long
+ * field in struct ratelimit_meminfo that receives the value.
+ */
+static struct ratelimit_meminfo_scan {
+       const char    *format;
+       ptrdiff_t      ptrdiff;
+} rlb_meminfo_scanfs[] = {
+       { "MemTotal:  %lu kB",
+         offsetof(struct ratelimit_meminfo, total) },
+       { "Dirty:     %lu kB",
+         offsetof(struct ratelimit_meminfo, dirty) },
+       { "Writeback: %lu kB",
+         offsetof(struct ratelimit_meminfo, writeback) },
+};
+
+/*
+ * Read /proc/meminfo and update the fields of @m listed in
+ * rlb_meminfo_scanfs[].
+ *
+ * Lines are read until every key has been found or the input ends. The
+ * stream is closed again before returning.
+ *
+ * Returns 0 if all keys were found, -ESRCH if at least one is missing
+ * (fields already matched keep their new value), or the error from
+ * opening the file.
+ */
+static int
+rlb_meminfo_scan(td_rlb_meminfo_t *m)
+{
+       const int n_keys = ARRAY_SIZE(rlb_meminfo_scanfs);
+       unsigned long todo;
+       char line[80];
+       int err, k;
+
+       err = rlb_meminfo_open(m);
+       if (err)
+               goto out;
+
+       /* one bit per key still to be found */
+       todo = (1UL << n_keys) - 1;
+
+       while (todo && fgets(line, sizeof(line), m->s)) {
+               for (k = 0; k < n_keys; k++) {
+                       const struct ratelimit_meminfo_scan *key;
+                       unsigned long val;
+
+                       if (!__test_bit(k, &todo))
+                               continue;
+
+                       key = &rlb_meminfo_scanfs[k];
+                       if (sscanf(line, key->format, &val) != 1)
+                               continue;
+
+                       /* store into the field this key is mapped to */
+                       *(unsigned long *)((char *)m + key->ptrdiff) = val;
+
+                       __clear_bit(k, &todo);
+               }
+       }
+
+       err = todo ? -ESRCH : 0;
+out:
+       rlb_meminfo_close(m);
+       return err;
+}
+
+/*
+ * Print the meminfo valve's command line synopsis to @stream, followed
+ * by the sub-valve's synopsis when one has been selected, or by a
+ * generic placeholder otherwise.
+ */
+static void
+rlb_meminfo_usage(td_rlb_t *rlb, FILE *stream, void *data)
+{
+       td_rlb_meminfo_t *mi = data;
+
+       fprintf(stream,
+               " {-t|--type}=meminfo "
+               " {-H|--high}=<percent> {-L|--low}=<percent>"
+               " {-p|--period}=<msecs> --");
+
+       if (!mi || !mi->valve.ops) {
+               fprintf(stream, " {-t|--type}={...}");
+               return;
+       }
+
+       mi->valve.ops->usage(rlb, stream, mi->valve.data);
+}
+
+/*
+ * Destroy the meminfo valve state, including the sub-valve's state if
+ * one was created. A NULL @data is accepted.
+ */
+static void
+rlb_meminfo_destroy(td_rlb_t *rlb, void *data)
+{
+       td_rlb_meminfo_t *mi = data;
+
+       if (!mi)
+               return;
+
+       if (mi->valve.data) {
+               mi->valve.ops->destroy(rlb, mi->valve.data);
+               mi->valve.data = NULL;
+       }
+
+       free(mi);
+}
+
+/*
+ * Create a meminfo valve from command line arguments.
+ *
+ * Recognized options: -H/--high and -L/--low (both required, non-zero,
+ * low < high), -p/--period (default 100) and -t/--type naming the
+ * sub-valve, which is created from the remaining arguments.
+ *
+ * Returns 0 on success, -EINVAL on invalid arguments, or another
+ * negative errno value.
+ *
+ * Ownership: the state is stored in *data before the sub-valve is
+ * created, so that the usage printer can reach the sub-valve. From that
+ * point on the caller owns it, even on failure, and must release it
+ * through the destroy operation. When the function fails earlier, the
+ * state is freed here and *data is left untouched.
+ */
+static int
+rlb_meminfo_create(td_rlb_t *rlb, int argc, char **argv, void **data)
+{
+       td_rlb_meminfo_t *m;
+       const char *type;
+       long dbr, period;
+       int published = 0;
+       int err;
+
+       m = calloc(1, sizeof(*m));
+       if (!m) {
+               PERROR("calloc");
+               /* do not read errno here: PERROR may have changed it */
+               err = -ENOMEM;
+               goto fail;
+       }
+
+       type      = NULL;
+       m->period = 100;
+
+       do {
+               const struct option longopts[] = {
+                       { "period",    1, NULL, 'p' },
+                       { "type",      1, NULL, 't' },
+                       { "high",      1, NULL, 'H' },
+                       { "low",       1, NULL, 'L' },
+                       { NULL,        0, NULL,  0  }
+               };
+               int c;
+
+               c = getopt_long(argc, argv, "p:t:H:L:", longopts, NULL);
+               if (c < 0)
+                       break;
+
+               switch (c) {
+               case 'p':
+                       /*
+                        * m->period is unsigned: check the sign on the
+                        * parsed value before storing it.
+                        */
+                       period = rlb_strtol(optarg);
+                       if (period < 0) {
+                               ERR("invalid --period");
+                               goto usage;
+                       }
+                       m->period = period;
+                       break;
+
+               case 'H':
+                       m->limit_hi = strtoul(optarg, NULL, 0);
+                       break;
+
+               case 'L':
+                       m->limit_lo = strtoul(optarg, NULL, 0);
+                       break;
+
+               case 't':
+                       type = optarg;
+                       break;
+
+               case '?':
+                       goto usage;
+
+               default:
+                       BUG();
+               }
+       } while (1);
+
+       if (!m->limit_hi || !m->limit_lo) {
+               ERR("--high/--low required");
+               goto usage;
+       }
+
+       if (m->limit_lo >= m->limit_hi) {
+               ERR("invalid --high/--low ratio");
+               goto usage;
+       }
+
+       if (!type) {
+               ERR("(sub) --type required");
+               goto usage;
+       }
+
+       dbr = sysctl_strtoul("vm/dirty_background_ratio");
+       if (dbr < 0) {
+               err = dbr;
+               ERR("vm/dirty_background_ratio: %d", err);
+               goto fail;
+       }
+
+       /* check deliberately disabled in the original code */
+       if (0 && m->limit_lo < dbr) {
+               ERR("--low %u is less than vm.dirty_background_ratio (= %ld)",
+                   m->limit_lo, dbr);
+               err = -EINVAL;
+               goto fail;
+       }
+
+       /* hand the state to the caller; it owns m from here on */
+       *data = m;
+       published = 1;
+
+       rlb_argv_shift(&optind, &argc, &argv);
+
+       err = rlb_create_valve(rlb, &m->valve, type, argc, argv);
+       if (err) {
+               if (err == -EINVAL)
+                       goto usage;
+               goto fail;
+       }
+
+       err = rlb_meminfo_scan(m);
+       if (err) {
+               PERROR("/proc/meminfo");
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       ERR("err = %d", err);
+       goto out;
+
+usage:
+       err = -EINVAL;
+out:
+       /* not yet visible to the caller: nobody else can free it */
+       if (!published)
+               free(m);
+       return err;
+}
+
+/*
+ * Select the main loop timeout: while congested the sub-valve decides,
+ * otherwise there is none (*_tv = NULL).
+ *
+ * Connections may only be waiting while the valve is congested.
+ */
+static void
+rlb_meminfo_settimeo(td_rlb_t *rlb, struct timeval **_tv, void *data)
+{
+       td_rlb_meminfo_t *mi = data;
+       int idle = list_empty(&rlb->wait);
+
+       BUG_ON(!(idle || mi->congested));
+
+       if (!mi->congested) {
+               *_tv = NULL;
+               return;
+       }
+
+       mi->valve.ops->settimeo(rlb, _tv, mi->valve.data);
+}
+
+/*
+ * Timeout handler: forwarded to the sub-valve. A timeout is only
+ * expected while congested; otherwise it is warned about and ignored.
+ */
+static void
+rlb_meminfo_timeout(td_rlb_t *rlb, void *data)
+{
+       td_rlb_meminfo_t *mi = data;
+
+       WARN_ON(!mi->congested);
+
+       if (!mi->congested)
+               return;
+
+       mi->valve.ops->timeout(rlb, mi->valve.data);
+}
+
+/*
+ * Compute the next congestion state from the current credit.
+ *
+ * Not congested: congestion starts once the credit is used up
+ * (cred <= 0); the sub-valve is reset at that moment.
+ * Congested: congestion only ends once the credit has reached
+ * limit_lo percent of the total memory figure (hysteresis).
+ *
+ * Returns non-zero if the valve is to be considered congested. The
+ * caller stores the result; m->congested is not modified here.
+ */
+static int
+rlb_meminfo_test_high(td_rlb_t *rlb, td_rlb_meminfo_t *m, long long cred)
+{
+       long long lo;
+
+       if (!m->congested) {
+               if (cred > 0)
+                       return 0;
+
+               m->valve.ops->reset(rlb, m->valve.data);
+               return 1;
+       }
+
+       /* hysteresis */
+       lo = (long long)m->total * m->limit_lo / 100;
+
+       return cred >= lo ? 0 : m->congested;
+}
+
+/*
+ * Uncongested dispatch: serve waiting connections in list order from
+ * the credit in *_cred. Each connection gets as much of its need as the
+ * remaining credit allows; serving stops when the credit is used up.
+ * The remaining credit is written back to *_cred.
+ */
+static void
+rlb_meminfo_dispatch_low(td_rlb_t *rlb, td_rlb_meminfo_t *m,
+                        long long *_cred)
+{
+       td_rlb_conn_t *c, *tmp;
+       long long left = *_cred;
+
+       rlb_for_each_waiting_safe(c, tmp, rlb) {
+               long long grant;
+
+               if (left <= 0)
+                       break;
+
+               grant = MIN(left, c->need);
+               rlb_conn_respond(rlb, c, grant);
+               left -= grant;
+       }
+
+       *_cred = left;
+}
+
+/*
+ * Dispatch handler of the meminfo valve.
+ *
+ * Refreshes the memory figures at most once per m->period, derives the
+ * available credit from them, and serves waiting connections directly
+ * while uncongested. Once congested, dispatching is delegated to the
+ * sub-valve.
+ */
+static void
+rlb_meminfo_dispatch(td_rlb_t *rlb, void *data)
+{
+       td_rlb_meminfo_t *m = data;
+       long long us, hi, cred, dirty, pend;
+
+       /* we run only once per m->period */
+
+       us = rlb_usec_since(rlb, &m->ts);
+       if (us / 1000 > m->period) {
+               /*
+                * NOTE(review): the scan result is ignored; on failure
+                * the previous (or partially updated) figures are used.
+                */
+               rlb_meminfo_scan(m);
+               m->ts = rlb->now;
+       }
+
+       /* uncongested credit:
+          memory below hi watermark minus pending I/O */
+
+       hi  = m->total;
+       hi *= m->limit_hi;
+       hi /= 100;
+
+       dirty = m->dirty + m->writeback;
+
+       /*
+        * The memory figures are in kB; presumably the factor of 1000
+        * brings them to the unit of the granted amounts -- TODO confirm.
+        */
+       cred  = hi - dirty;
+       cred *= 1000;
+
+       pend  = rlb_pending(rlb);
+       cred -= pend;
+
+       m->congested = rlb_meminfo_test_high(rlb, m, cred);
+
+       /* NOTE(review): pend is a long long but is printed with %llu */
+       DBG(3, "dirty=%lld (%lld) pend=%llu cred=%lld %s",
+           dirty, dirty * 100 / m->total, pend, cred,
+           m->congested ? "congested" : "");
+
+       if (!m->congested) {
+               rlb_meminfo_dispatch_low(rlb, m, &cred);
+
+               /* granting may have used up the credit: test again */
+               m->congested = rlb_meminfo_test_high(rlb, m, cred);
+       }
+
+       if (m->congested)
+               m->valve.ops->dispatch(rlb, m->valve.data);
+}
+
+/*
+ * Operations table of the meminfo valve.
+ *
+ * NOTE(review): no .reset handler is set, while rlb_meminfo_test_high()
+ * calls the sub-valve's reset unconditionally; selecting meminfo as its
+ * own sub-valve therefore looks unsupported -- confirm.
+ */
+static struct ratelimit_ops rlb_meminfo_ops = {
+       .usage    = rlb_meminfo_usage,
+       .create   = rlb_meminfo_create,
+       .destroy  = rlb_meminfo_destroy,
+       .info     = rlb_meminfo_info,
+
+       .settimeo = rlb_meminfo_settimeo,
+       .timeout  = rlb_meminfo_timeout,
+       .dispatch = rlb_meminfo_dispatch,
+};
+
+/*
+ * main loop
+ */
+
+/* Log the state of the valve, then of all connections. */
+static void
+rlb_info(td_rlb_t *rlb)
+{
+       struct rlb_valve *v = &rlb->valve;
+
+       v->ops->info(rlb, v->data);
+
+       rlb_conn_infos(rlb);
+}
+
+/* signal mask in effect before rlb_siginit(); installed during pselect() */
+static sigset_t rlb_sigunblock;
+/* signals caught by rlb_sigmark(); inspected by rlb_main_signaled() */
+static sigset_t rlb_sigpending;
+
+/*
+ * Signal handler: record @signo in rlb_sigpending.
+ *
+ * NOTE(review): logging through INFO() from a signal handler is
+ * presumably not async-signal-safe; also, nothing visible here ever
+ * removes a signal from rlb_sigpending again -- confirm both are
+ * acceptable.
+ */
+static void
+rlb_sigmark(int signo)
+{
+       INFO("Caught SIG%d", signo);
+       sigaddset(&rlb_sigpending, signo);
+}
+
+/*
+ * Install the signal dispositions: SIGPIPE is ignored; SIGINT, SIGTERM
+ * and SIGUSR1 are recorded by rlb_sigmark(). The latter three are then
+ * blocked, and the previous signal mask is saved in rlb_sigunblock.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+rlb_siginit(void)
+{
+       struct sigaction sa_ignore  = { .sa_handler = SIG_IGN };
+       struct sigaction sa_pending = { .sa_handler = rlb_sigmark };
+       const struct {
+               int                     signo;
+               const struct sigaction *action;
+       } setup[] = {
+               { SIGPIPE, &sa_ignore  },
+               { SIGINT,  &sa_pending },
+               { SIGTERM, &sa_pending },
+               { SIGUSR1, &sa_pending },
+       };
+       sigset_t blocked;
+       unsigned int i;
+
+       /* stop at the first failure, as errno belongs to that call */
+       for (i = 0; i < sizeof(setup) / sizeof(setup[0]); i++)
+               if (sigaction(setup[i].signo, setup[i].action, NULL))
+                       return -errno;
+
+       sigemptyset(&blocked);
+       sigaddset(&blocked, SIGINT);
+       sigaddset(&blocked, SIGTERM);
+       sigaddset(&blocked, SIGUSR1);
+
+       if (sigprocmask(SIG_BLOCK, &blocked, &rlb_sigunblock))
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * React to the signals recorded in rlb_sigpending: SIGUSR1 logs the
+ * current state; SIGINT or SIGTERM request termination.
+ *
+ * Returns -EINTR if the main loop should terminate, 0 otherwise.
+ */
+static int
+rlb_main_signaled(td_rlb_t *rlb)
+{
+       if (sigismember(&rlb_sigpending, SIGUSR1))
+               rlb_info(rlb);
+
+       if (!sigismember(&rlb_sigpending, SIGINT) &&
+           !sigismember(&rlb_sigpending, SIGTERM))
+               return 0;
+
+       return -EINTR;
+}
+
+
+/*
+ * Look up a valve implementation by its exact name.
+ *
+ * Returns the valve's operations table, or NULL if the name is unknown.
+ */
+static struct ratelimit_ops *
+rlb_find_valve(const char *name)
+{
+       static const struct {
+               const char           *name;
+               struct ratelimit_ops *ops;
+       } valves[] = {
+#if 0
+               { "leaky",   &rlb_leaky_ops   },
+#endif
+               { "token",   &rlb_token_ops   },
+               { "meminfo", &rlb_meminfo_ops },
+       };
+       unsigned int i;
+
+       for (i = 0; i < sizeof(valves) / sizeof(valves[0]); i++)
+               if (!strcmp(name, valves[i].name))
+                       return valves[i].ops;
+
+       return NULL;
+}
+
+/*
+ * Run one iteration of the main loop: wait for input on stdin, the
+ * listening socket and all open connections (or for the valve's
+ * timeout), then process whatever became ready.
+ *
+ * Returns 0 on success or a negative errno value from pselect();
+ * -EINTR means a signal arrived.
+ */
+static int
+rlb_main_iterate(td_rlb_t *rlb)
+{
+       td_rlb_conn_t *conn, *next;
+       struct timeval *tv;
+       struct timespec _ts, *ts = &_ts;
+       int nfds, err;
+       fd_set rfds;
+
+       FD_ZERO(&rfds);
+       nfds = 0;
+
+       /* main() sets stdin to NULL after daemonizing */
+       if (stdin) {
+               FD_SET(STDIN_FILENO, &rfds);
+               nfds = MAX(nfds, STDIN_FILENO);
+       }
+
+       if (rlb->sock >= 0) {
+               FD_SET(rlb->sock, &rfds);
+               nfds = MAX(nfds, rlb->sock);
+       }
+
+       /* NOTE(review): descriptors are not checked against FD_SETSIZE */
+       rlb_for_each_conn(conn, rlb) {
+               FD_SET(conn->sock, &rfds);
+               nfds = MAX(nfds, conn->sock);
+       }
+
+       /* the valve decides how long we may sleep; NULL means forever */
+       rlb->valve.ops->settimeo(rlb, &tv, rlb->valve.data);
+       if (tv) {
+               TIMEVAL_TO_TIMESPEC(tv, ts);
+       } else
+               ts = NULL;
+
+       /* remember when we went to sleep */
+       rlb->ts = rlb->now;
+
+       /* signals blocked by rlb_siginit() are only delivered in here */
+       nfds = pselect(nfds + 1, &rfds, NULL, NULL, ts, &rlb_sigunblock);
+       if (nfds < 0) {
+               err = -errno;
+               if (err != -EINTR)
+                       PERROR("select");
+               goto fail;
+       }
+
+       gettimeofday(&rlb->now, NULL);
+
+       /* nothing ready: the timeout expired */
+       if (!nfds) {
+               BUG_ON(!ts);
+               rlb->valve.ops->timeout(rlb, rlb->valve.data);
+       }
+
+       /*
+        * From here on nfds counts the ready descriptors not yet
+        * handled: connections first, then stdin, then the listener.
+        */
+       if (nfds) {
+               rlb_for_each_conn_safe(conn, next, rlb)
+                       if (FD_ISSET(conn->sock, &rfds)) {
+                               rlb_conn_receive(rlb, conn);
+                               if (!--nfds)
+                                       break;
+                       }
+
+               rlb->valve.ops->dispatch(rlb, rlb->valve.data);
+       }
+
+       /* any input on stdin triggers an info dump */
+       if (unlikely(nfds)) {
+               if (FD_ISSET(STDIN_FILENO, &rfds)) {
+                       getc(stdin);
+                       rlb_info(rlb);
+                       nfds--;
+               }
+       }
+
+       if (unlikely(nfds)) {
+               if (FD_ISSET(rlb->sock, &rfds)) {
+                       rlb_accept_conn(rlb);
+                       nfds--;
+               }
+       }
+
+       /* every ready descriptor must have been accounted for */
+       BUG_ON(nfds);
+       err = 0;
+fail:
+       return err;
+}
+
+/*
+ * Main loop: iterate for as long as there is a listening socket or an
+ * open connection.
+ *
+ * An iteration interrupted by a signal is followed by signal handling;
+ * a termination request ends the loop with status 0. Any other
+ * iteration error ends the loop and is returned.
+ */
+static int
+rlb_main_run(td_rlb_t *rlb)
+{
+       int err;
+
+       do {
+               err = rlb_main_iterate(rlb);
+               if (!err)
+                       continue;
+
+               if (err != -EINTR)
+                       return err;
+
+               /* interrupted: termination request ends the run cleanly */
+               if (rlb_main_signaled(rlb))
+                       return 0;
+
+               err = 0;
+
+       } while (rlb->sock >= 0 || !list_empty(&rlb->open));
+
+       return err;
+}
+
+/* Close every open connection, then the listening socket. */
+static void
+rlb_shutdown(td_rlb_t *rlb)
+{
+       td_rlb_conn_t *c, *tmp;
+
+       rlb_for_each_conn_safe(c, tmp, rlb) {
+               rlb_conn_close(rlb, c);
+       }
+
+       rlb_sock_close(rlb);
+}
+
+/*
+ * Print the command line synopsis to @stream. If a valve has been
+ * selected, its own synopsis is appended; otherwise the generic options
+ * are listed. @rlb may be NULL.
+ */
+static void
+rlb_usage(td_rlb_t *rlb, const char *prog, FILE *stream)
+{
+       fprintf(stream, "Usage: %s <name>", prog);
+
+       if (!rlb || !rlb->valve.ops)
+               fprintf(stream,
+                       " {-t|--type}={token|meminfo}"
+                       " [-h|--help] [-D|--debug=<n>]");
+       else
+               rlb->valve.ops->usage(rlb, stream, rlb->valve.data);
+
+       fprintf(stream, "\n");
+}
+
+/*
+ * Tear down a bridge: close all connections and the listening socket,
+ * destroy the valve state if there is any, and release the name.
+ */
+static void
+rlb_destroy(td_rlb_t *rlb)
+{
+       struct rlb_valve *v = &rlb->valve;
+
+       rlb_shutdown(rlb);
+
+       if (v->data) {
+               v->ops->destroy(rlb, v->data);
+               v->data = NULL;
+       }
+
+       /* free(NULL) is a no-op, so an unset name needs no guard */
+       free(rlb->name);
+       rlb->name = NULL;
+}
+
+/*
+ * Initialize @rlb as a bridge called @name: reset all state, release
+ * every connection slot, copy the name and open the listening socket.
+ *
+ * Returns 0 on success. On failure a negative errno value is returned
+ * and everything set up so far has been destroyed again.
+ */
+static int
+rlb_create(td_rlb_t *rlb, const char *name)
+{
+       int idx, err;
+
+       memset(rlb, 0, sizeof(*rlb));
+       INIT_LIST_HEAD(&rlb->open);
+       INIT_LIST_HEAD(&rlb->wait);
+       rlb->sock = -1;
+
+       /* release the slots from the highest index down to 0 */
+       idx = RLB_CONN_MAX;
+       while (idx-- > 0)
+               rlb_conn_free(rlb, &rlb->connv[idx]);
+
+       rlb->name = strdup(name);
+       if (!rlb->name) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = rlb_sock_open(rlb);
+       if (err)
+               goto fail;
+
+       gettimeofday(&rlb->now, NULL);
+
+       return 0;
+
+fail:
+       WARN("err = %d", err);
+       rlb_destroy(rlb);
+       return err;
+}
+
+/*
+ * Instantiate the valve implementation called @name in @v, passing it
+ * the remaining command line arguments.
+ *
+ * Returns -ESRCH if no such implementation exists, otherwise the result
+ * of the implementation's create operation.
+ */
+static int
+rlb_create_valve(td_rlb_t *rlb, struct rlb_valve *v,
+                const char *name, int argc, char **argv)
+{
+       struct ratelimit_ops *found = rlb_find_valve(name);
+
+       if (!found) {
+               ERR("No such driver: %s", name);
+               return -ESRCH;
+       }
+
+       v->ops = found;
+
+       return found->create(rlb, argc, argv, &v->data);
+}
+
+/*
+ * Switch logging to syslog, identifying as "<name>[<pid>]" with the
+ * given facility.
+ */
+static void
+rlb_openlog(const char *name, int facility)
+{
+       /* static: openlog() is handed this buffer, so it must persist */
+       static char tag[32];
+
+       snprintf(tag, sizeof(tag), "%s[%d]", name, getpid());
+       tag[sizeof(tag) - 1] = 0;
+
+       openlog(tag, 0, facility);
+
+       rlb_vlog = vsyslog;
+}
+
+/*
+ * Entry point of the ratelimit bridge.
+ *
+ * Usage: <prog> {-t|--type}=<valve> [-h|--help] [-D|--debug=<n>] <name>
+ *        followed by the options of the selected valve.
+ *
+ * Parses the generic options, creates the bridge and the selected valve,
+ * daemonizes (unless a non-zero debug level was given) and runs the main
+ * loop. The exit status is 0 on success or a positive errno value.
+ */
+int
+main(int argc, char **argv)
+{
+       td_rlb_t _rlb, *rlb;
+       const char *prog, *type;
+       int err;
+
+       setbuf(stdin, NULL);
+       setlinebuf(stderr);
+
+       rlb      = NULL;
+       prog     = basename(argv[0]);
+       type     = NULL;
+       rlb_vlog = rlb_vlog_vfprintf;
+
+       do {
+               /*
+                * --debug takes an argument, like its short form "D:";
+                * declaring it without one would leave optarg NULL for
+                * the strtoul() call below.
+                */
+               const struct option longopts[] = {
+                       { "help",        0, NULL, 'h' },
+                       { "type",        1, NULL, 't' },
+                       { "debug",       1, NULL, 'D' },
+                       { NULL,          0, NULL,  0  },
+               };
+               int c;
+
+               c = getopt_long(argc, argv, "ht:D:", longopts, NULL);
+               if (c < 0)
+                       break;
+
+               switch (c) {
+               case 'h':
+                       rlb_usage(NULL, prog, stdout);
+                       return 0;
+
+               case 't':
+                       type = optarg;
+                       break;
+
+               case 'D':
+                       debug = strtoul(optarg, NULL, 0);
+                       break;
+
+               case '?':
+                       goto usage;
+
+               default:
+                       BUG();
+               }
+
+       } while (1);
+
+       if (!type)
+               goto usage;
+
+       /* the bridge name is the first non-option argument */
+       if (argc - optind < 1)
+               goto usage;
+
+       err = rlb_siginit();
+       if (err)
+               goto fail;
+
+       err = rlb_create(&_rlb, argv[optind++]);
+       if (err)
+               goto fail;
+
+       /* from here on the failure path has to destroy the bridge */
+       rlb = &_rlb;
+
+       rlb_argv_shift(&optind, &argc, &argv);
+
+       err = rlb_create_valve(rlb, &rlb->valve, type, argc, argv);
+       if (err) {
+               if (err == -EINVAL)
+                       goto usage;
+               goto fail;
+       }
+
+       if (!debug) {
+               err = daemon(0, 0);
+               if (err)
+                       goto fail;
+
+               /* the main loop stops polling stdin once it is NULL */
+               stdin = stdout = stderr = NULL;
+               rlb_openlog(prog, LOG_DAEMON);
+       }
+
+       INFO("TD ratelimit bridge: %s, pid %d", rlb->path, getpid());
+
+       rlb_info(rlb);
+
+       err = rlb_main_run(rlb);
+
+       if (err)
+               INFO("Exiting with status %d", -err);
+
+fail:
+       if (rlb)
+               rlb_destroy(rlb);
+
+       return -err;
+
+usage:
+       rlb_usage(rlb, prog, stderr);
+       err = -EINVAL;
+       goto fail;
+}
diff --git a/tools/blktap3/drivers/td.c b/tools/blktap3/drivers/td.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/td.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+#include "tapdisk-utils.h"
+
+/*
+ * Debug print helper. The "#if 1" branch is the one that gets compiled, so
+ * DFPRINTF formats to stdout; flipping the condition to 0 turns every use
+ * into a no-op expression.
+ */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf ( stdout, _f , ## _a )
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * Identifiers of the per-VDI fields this tool knows about.
+ * TD_FIELD_INVALID is the sentinel and doubles as the size of
+ * td_vdi_fields[].
+ */
+typedef enum {
+       TD_FIELD_HIDDEN  = 0,
+       TD_FIELD_INVALID = 1
+} td_field_t;
+
+/* Associates the command-line name of a field with its identifier. */
+struct vdi_field {
+       char       *name;
+       td_field_t  id;
+};
+
+/* Table of known fields; entries are placed at the index given by their id. */
+static struct vdi_field td_vdi_fields[TD_FIELD_INVALID] = {
+       { .id = TD_FIELD_HIDDEN, .name = "hidden" }
+};
+
+/*
+ * Sub-commands of td-util. The commented-out enumerators are disabled
+ * commands; the same entries are commented out in commands[] below, which
+ * keeps every enumerator equal to the index of its table entry.
+ * TD_CMD_INVALID is the sentinel and the size of commands[].
+ */
+typedef enum {
+       TD_CMD_CREATE    = 0,
+       TD_CMD_SNAPSHOT,
+/*     TD_CMD_COALESCE,       */
+       TD_CMD_QUERY,
+/*     TD_CMD_RESIZE,         */
+       TD_CMD_SET,
+/*     TD_CMD_REPAIR,         */
+/*     TD_CMD_FILL,           */
+/*     TD_CMD_READ,           */
+       TD_CMD_INVALID,
+} td_command_t;
+
+/*
+ * Describes one sub-command: its identifier, the name typed on the command
+ * line, and whether a TYPE argument must follow the command name.
+ */
+struct command {
+       td_command_t  id;
+       char         *name;
+       int           needs_type;
+};
+
+/* Sub-command table, in the same order as td_command_t. */
+struct command commands[TD_CMD_INVALID] = {
+       { .id = TD_CMD_CREATE,   .name = "create",   .needs_type = 1 },
+       { .id = TD_CMD_SNAPSHOT, .name = "snapshot", .needs_type = 1 },
+/*     { .id = TD_CMD_COALESCE, .name = "coalesce", .needs_type = 1 },    */
+       { .id = TD_CMD_QUERY,    .name = "query",    .needs_type = 1 },
+/*     { .id = TD_CMD_RESIZE,   .name = "resize",   .needs_type = 1 },    */
+       { .id = TD_CMD_SET,      .name = "set",      .needs_type = 1 },
+/*     { .id = TD_CMD_REPAIR,   .name = "repair",   .needs_type = 1 },    */
+/*     { .id = TD_CMD_FILL,     .name = "fill",     .needs_type = 1 },    */
+/*     { .id = TD_CMD_READ,     .name = "read",     .needs_type = 1 },    */
+};
+
+/*
+ * Supported image types. TD_TYPE_INVALID is the sentinel and the size of
+ * td_disk_types[].
+ */
+typedef enum {
+       TD_TYPE_VHD         = 0,
+       TD_TYPE_AIO,
+       TD_TYPE_INVALID,
+} td_disk_t;
+
+/* Command-line names of the image types, in td_disk_t order. */
+const char *td_disk_types[TD_TYPE_INVALID] = {
+       "vhd",
+       "aio",
+};
+
+/*
+ * Print "COMMAND := { a | b | ... }" to stderr using the names in
+ * commands[]. Expands to a single do/while(0) statement; the loop counter
+ * is declared inside the expansion.
+ */
+#define print_commands()                                               \
+       do {                                                            \
+               int i;                                                  \
+               fprintf(stderr, "COMMAND := { ");                       \
+               fprintf(stderr, "%s", commands[0].name);                \
+               for (i = 1; i < TD_CMD_INVALID; i++)                    \
+                       fprintf(stderr, " | %s", commands[i].name);     \
+               fprintf(stderr, " }\n");                                \
+       } while (0)
+
+/*
+ * Print "TYPE := { a | b | ... }" to stderr using the names in
+ * td_disk_types[]. Expands to a single do/while(0) statement; the caller
+ * supplies the terminating semicolon, so the macro is safe to use as the
+ * body of an unbraced if/else.
+ */
+#define print_disk_types()                                             \
+       do {                                                            \
+               int i;                                                  \
+               fprintf(stderr, "TYPE := { ");                          \
+               fprintf(stderr, "%s", td_disk_types[0]);                \
+               for (i = 1; i < TD_TYPE_INVALID; i++)                   \
+                       fprintf(stderr, " | %s", td_disk_types[i]);     \
+               fprintf(stderr, " }\n");                                \
+       } while (0)
+
+/*
+ * Print "FIELD := { a | b | ... }" to stderr using the names in
+ * td_vdi_fields[]. With TD_FIELD_INVALID equal to 1 the loop body never
+ * runs and only the first field name is printed.
+ */
+#define print_field_names()                                            \
+       do {                                                            \
+               int i;                                                  \
+               fprintf(stderr, "FIELD := { ");                         \
+               fprintf(stderr, "%s", td_vdi_fields[0].name);           \
+               for (i = 1; i < TD_FIELD_INVALID; i++)                  \
+                       fprintf(stderr, " | %s", td_vdi_fields[i].name); \
+               fprintf(stderr, " }\n");                                \
+       } while (0)
+
+/*
+ * Write the version banner, the general usage line and the lists of
+ * commands and disk types to stderr, then terminate the process with
+ * exit(-1). This function does not return.
+ */
+void
+help(void)
+{
+       fputs("Tapdisk Utilities: v1.0.0\n", stderr);
+       fputs("usage: td-util COMMAND [TYPE] [OPTIONS]\n", stderr);
+       print_commands();
+       print_disk_types();
+       exit(-1);
+}
+
+/*
+ * Find the sub-command whose name is exactly equal to @command.
+ * Returns a pointer into commands[], or NULL when no entry matches.
+ */
+struct command *
+get_command(char *command)
+{
+       struct command *cur = commands;
+       struct command *end = commands + TD_CMD_INVALID;
+
+       while (cur != end) {
+               if (strcmp(command, cur->name) == 0)
+                       return cur;
+               cur++;
+       }
+
+       return NULL;
+}
+
+/*
+ * Find the VDI field whose name is exactly equal to @field.
+ * Returns a pointer into td_vdi_fields[], or NULL when no entry matches.
+ */
+struct vdi_field *
+get_field(char *field)
+{
+       struct vdi_field *entry;
+
+       for (entry = td_vdi_fields;
+            entry < td_vdi_fields + TD_FIELD_INVALID; entry++) {
+               if (strcmp(field, entry->name) == 0)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/*
+ * Translate a disk type name into its index in td_disk_types[].
+ *
+ * Returns the index (a TD_TYPE_* value) on a match, -ENAMETOOLONG when
+ * @type is 25 characters or longer, and -TD_TYPE_INVALID when the name is
+ * not in the table.
+ */
+int
+get_driver_type(char *type)
+{
+       int idx = 0;
+
+       if (strnlen(type, 25) >= 25)
+               return -ENAMETOOLONG;
+
+       while (idx < TD_TYPE_INVALID) {
+               if (strcmp(type, td_disk_types[idx]) == 0)
+                       return idx;
+               idx++;
+       }
+
+       return -TD_TYPE_INVALID;
+}
+
+/*
+ * Handle "td-util create TYPE [-r] [-b] <SIZE(MB)> <FILENAME>".
+ *
+ * @type: index into td_disk_types[] (a TD_TYPE_* value).
+ * @argc, @argv: sub-command arguments; argv[0] is the command name.
+ *
+ * VHD images are delegated to vhd_util_create(). Any other type is written
+ * out as a fully allocated file of zeroes, one megabyte at a time, and is
+ * only accepted together with -r (a sparse image of such a type is
+ * refused).
+ *
+ * Returns 0 on success and a positive errno value on failure; for VHD
+ * images the result of vhd_util_create() is returned unchanged.
+ */
+int
+td_create(int type, int argc, char *argv[])
+{
+       ssize_t mb;
+       long pagesz;
+       long long val;
+       uint64_t size, i;
+       void *buf;
+       char *name, *end;
+       int c, fd, err, sparse = 1, fixedsize = 0;
+
+       while ((c = getopt(argc, argv, "hrb")) != -1) {
+               switch(c) {
+               case 'r':
+                       sparse = 0;
+                       break;
+               case 'b':
+                       fixedsize = 1;
+                       break;
+               default:
+                       /* getopt() returned '?'; optopt holds the offender */
+                       fprintf(stderr, "Unknown option %c\n", (char)optopt);
+                       /* fall through */
+               case 'h':
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 2))
+               goto usage;
+
+       /*
+        * atoi() would silently turn garbage into 0 and a negative number
+        * into a huge unsigned size, so parse and range-check explicitly.
+        * The upper bound keeps the size representable in bytes.
+        */
+       errno = 0;
+       val = strtoll(argv[optind], &end, 10);
+       if (errno || end == argv[optind] || *end != '\0' || val < 0 ||
+           (uint64_t)val > (~(uint64_t)0 >> 20)) {
+               fprintf(stderr, "Invalid size %s\n", argv[optind]);
+               goto usage;
+       }
+       optind++;
+
+       mb   = 1 << 20;
+       size = (uint64_t)val;           /* image size in megabytes */
+       name = argv[optind];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       if (type == TD_TYPE_VHD) {
+               int cargc = 0;
+               char sbuf[32], *cargv[10];
+
+               memset(cargv, 0, sizeof(cargv));
+               snprintf(sbuf, sizeof(sbuf), "%"PRIu64, size);
+               cargv[cargc++] = "create";
+               cargv[cargc++] = "-n";
+               cargv[cargc++] = name;
+               cargv[cargc++] = "-s";
+               cargv[cargc++] = sbuf;
+               if (!sparse)
+                       cargv[cargc++] = "-r";
+               if (fixedsize)
+                       cargv[cargc++] = "-b";
+
+               return vhd_util_create(cargc, cargv);
+       }
+
+       /* generic create */
+       if (sparse) {
+               fprintf(stderr, "Cannot create sparse %s image\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       /*
+        * The file is opened with O_DIRECT, which requires a suitably
+        * aligned user buffer; memory from calloc() carries no such
+        * guarantee, so allocate page-aligned memory and zero it by hand.
+        */
+       pagesz = sysconf(_SC_PAGESIZE);
+       if (pagesz <= 0)
+               pagesz = 4096;
+
+       if (posix_memalign(&buf, pagesz, mb))
+               return ENOMEM;
+       memset(buf, 0, mb);
+
+       fd = open(name, O_WRONLY | O_DIRECT | O_CREAT | O_TRUNC, 0644);
+       if (fd == -1) {
+               err = errno;            /* free() may overwrite errno */
+               free(buf);
+               return err;
+       }
+
+       for (i = 0; i < size; i++)
+               if (write(fd, buf, mb) != mb) {
+                       /* do not leave a partially written image behind */
+                       close(fd);
+                       unlink(name);
+                       free(buf);
+                       return EIO;
+               }
+
+       close(fd);
+       free(buf);
+       return 0;
+
+ usage:
+       fprintf(stderr, "usage: td-util create %s [-h help] [-r reserve] "
+               "[-b file_is_fixed_size] <SIZE(MB)> <FILENAME>\n",
+               td_disk_types[type]);
+       return EINVAL;
+}
+
+/*
+ * Handle "td-util snapshot TYPE [-m] [-b] [-l LIMIT] <FILENAME>
+ * <BACKING_FILENAME>".
+ *
+ * @type: index into td_disk_types[]; only TD_TYPE_VHD is accepted.
+ * @argc, @argv: sub-command arguments; argv[0] is the command name.
+ *
+ * After checking the arguments and that the backing file can be stat()ed,
+ * the request is forwarded to vhd_util_snapshot() as
+ * "snapshot -n NAME -p BACKING [-b] [-m] [-l LIMIT]".
+ *
+ * Returns the result of vhd_util_snapshot(), or a positive errno value when
+ * validation fails. -h prints the usage text and returns 0.
+ */
+int
+td_snapshot(int type, int argc, char *argv[])
+{
+       char *cargv[10];
+       int c, err, cargc;
+       struct stat stats;
+       char *name, *backing, *limit = NULL;
+       int fixedsize = 0, rawparent = 0;
+
+       if (type != TD_TYPE_VHD) {
+               fprintf(stderr, "Cannot create snapshot of %s image type\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       while ((c = getopt(argc, argv, "hbml:")) != -1) {
+               switch(c) {
+               case 'b':
+                       fixedsize = 1;
+                       break;
+               case 'm':
+                       rawparent = 1;
+                       break;
+               case 'l':
+                       limit = optarg;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 2)) {
+               err = EINVAL;
+               goto usage;
+       }
+
+       name    = argv[optind++];
+       backing = argv[optind++];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN ||
+           strnlen(backing, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       if (stat(backing, &stats) == -1) {
+               /* capture errno first: fprintf() is allowed to change it */
+               err = errno;
+               fprintf(stderr, "File %s not found\n", backing);
+               return err;
+       }
+
+       /* at most 9 of the 10 slots are used: 5 fixed + -b + -m + -l LIMIT */
+       cargc = 0;
+       memset(cargv, 0, sizeof(cargv));
+       cargv[cargc++] = "snapshot";
+       cargv[cargc++] = "-n";
+       cargv[cargc++] = name;
+       cargv[cargc++] = "-p";
+       cargv[cargc++] = backing;
+       if (fixedsize)
+               cargv[cargc++] = "-b";
+       if (rawparent)
+               cargv[cargc++] = "-m";
+       if (limit) {
+               cargv[cargc++] = "-l";
+               cargv[cargc++] = limit;
+       }
+       return vhd_util_snapshot(cargc, cargv);
+
+ usage:
+       fprintf(stderr, "usage: td-util snapshot %s [-h help] [-m parent_raw] "
+               "[-b file_is_fixed_size] [-l snapshot depth limit] "
+               "<FILENAME> <BACKING_FILENAME>\n", td_disk_types[type]);
+       return err;
+}
+
+/*
+ * Handle "td-util coalesce TYPE <FILENAME>".
+ *
+ * @type: index into td_disk_types[]; only TD_TYPE_VHD is accepted.
+ * @argc, @argv: sub-command arguments; argv[0] is the command name.
+ *
+ * Forwards the request to vhd_util_coalesce() as "coalesce -n NAME" and
+ * reports a non-zero result on stdout.
+ *
+ * Returns the result of vhd_util_coalesce(), or a positive errno value when
+ * validation fails.
+ */
+int
+td_coalesce(int type, int argc, char *argv[])
+{
+       int c, ret, cargc;
+       char *name, *cargv[3];
+
+       if (type != TD_TYPE_VHD) {
+               fprintf(stderr, "Cannot coalesce %s image type\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       while ((c = getopt(argc, argv, "h")) != -1) {
+               switch(c) {
+               default:
+                       fprintf(stderr, "Unknown option %c\n", (char)c);
+                       /* fall through */
+               case 'h':
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 1))
+               goto usage;
+
+       name = argv[optind++];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       /* exactly three arguments are passed, filling cargv[] completely */
+       cargc = 0;
+       memset(cargv, 0, sizeof(cargv));
+       cargv[cargc++] = "coalesce";
+       cargv[cargc++] = "-n";
+       cargv[cargc++] = name;
+       ret = vhd_util_coalesce(cargc, cargv);
+       if (ret)
+               printf("coalesce failed: %d\n", ret);
+
+       return ret;
+
+ usage:
+       fprintf(stderr, "usage: td-util coalesce %s [-h help] "
+               "<FILENAME>\n", td_disk_types[type]);
+       return EINVAL;
+}
+
+/*
+ * Handle "td-util query TYPE [-v] [-p] [-f] [-d] <FILENAME>".
+ *
+ * @type: index into td_disk_types[] (TD_TYPE_VHD or TD_TYPE_AIO).
+ * @argc, @argv: sub-command arguments; argv[0] is the command name.
+ *
+ * Options select what is printed on stdout:
+ *   -v  virtual size in megabytes
+ *   -p  parent image name
+ *   -f  field values (currently only "hidden")
+ *   -d  chain depth (honoured for VHD images only)
+ *
+ * Returns 0 on success, otherwise the error code of the step that failed.
+ * The sign is not normalised here: a failed open() of an AIO image yields
+ * -errno while argument errors yield positive errno values. -h prints the
+ * usage text and returns 0.
+ */
+int
+td_query(int type, int argc, char *argv[])
+{
+       char *name;
+       int c, size = 0, parent = 0, fields = 0, depth = 0, err = 0;
+
+       while ((c = getopt(argc, argv, "hvpfd")) != -1) {
+               switch(c) {
+               case 'v':
+                       size = 1;
+                       break;
+               case 'p':
+                       parent = 1;
+                       break;
+               case 'f':
+                       fields = 1;
+                       break;
+               case 'd':
+                       depth = 1;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (optind != (argc - 1)) {
+               err = EINVAL;
+               goto usage;
+       }
+
+       name = argv[optind++];
+
+       if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) {
+               fprintf(stderr, "Device name too long\n");
+               return ENAMETOOLONG;
+       }
+
+       if (type == TD_TYPE_VHD) {
+               vhd_context_t vhd;
+
+               err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+               if (err) {
+                       printf("failed opening %s: %d\n", name, err);
+                       return err;
+               }
+
+               if (size)
+                       printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+               if (parent) {
+                       if (vhd.footer.type != HD_TYPE_DIFF)
+                               printf("%s has no parent\n", name);
+                       else {
+                               char *pname;
+
+                               err = vhd_parent_locator_get(&vhd, &pname);
+                               if (err)
+                                       printf("failed getting parent: %d\n",
+                                              err);
+                               else {
+                                       printf("%s\n", pname);
+                                       free(pname);
+                               }
+                       }
+               }
+
+               /* below, the first error seen is the one that is returned */
+               if (fields) {
+                       int ret, hidden;
+
+                       ret = vhd_hidden(&vhd, &hidden);
+                       if (ret) {
+                               printf("failed checking 'hidden' field: %d\n",
+                                      ret);
+                               err = (err ? : ret);
+                       } else
+                               printf("%s: %d\n",
+                                      td_vdi_fields[TD_FIELD_HIDDEN].name,
+                                      hidden);
+               }
+
+               if (depth) {
+                       int ret, length;
+
+                       ret = vhd_chain_depth(&vhd, &length);
+                       if (ret)
+                               printf("error checking chain depth: %d\n", ret);
+                       else
+                               printf("chain depth: %d\n", length);
+
+                       err = (err ? : ret);
+               }
+
+               vhd_close(&vhd);
+
+       } else if (type == TD_TYPE_AIO) {
+               if (size) {
+                       int fd;
+                       uint64_t secs;
+                       uint32_t ssize;
+
+                       fd = open(name, O_RDONLY | O_LARGEFILE);
+                       if (fd == -1) {
+                               /* printf() may change errno; capture first */
+                               err = errno;
+                               printf("failed opening %s: %d\n", name, err);
+                               return -err;
+                       }
+
+                       err = tapdisk_get_image_size(fd, &secs, &ssize);
+                       close(fd);
+
+                       if (err) {
+                               printf("failed getting size for %s: %d\n",
+                                      name, err);
+                               return err;
+                       }
+
+                       /* same >> 11 conversion as the original code */
+                       printf("%"PRIu64"\n", secs >> 11);
+               }
+
+               if (parent)
+                       printf("%s has no parent\n", name);
+
+               if (fields) {
+                       int i;
+
+                       for (i = 0; i < TD_FIELD_INVALID; i++)
+                               printf("%s: 0\n", td_vdi_fields[i].name);
+               }
+       }
+
+       return err;
+
+ usage:
+       fprintf(stderr, "usage: td-util query %s [-h help] [-v virtsize] "
+               "[-p parent] [-f fields] [-d depth] <FILENAME>\n",
+               td_disk_types[type]);
+       return err;
+}
+
+/*
+ * Handle "td-util set TYPE <FILENAME> <FIELD> <VALUE>".
+ *
+ * @type: index into td_disk_types[]; only TD_TYPE_VHD is accepted.
+ * @argc, @argv: sub-command arguments; argv[0] is the command name.
+ *
+ * The only field accepted is the one identified by TD_FIELD_HIDDEN. The
+ * request is forwarded to vhd_util_set_field() as
+ * "set -n NAME -f FIELD -v VALUE".
+ *
+ * Returns the result of vhd_util_set_field(), or EINVAL after printing the
+ * usage text when the arguments are not acceptable.
+ */
+int
+td_set_field(int type, int argc, char *argv[])
+{
+       int opt;
+       struct vdi_field *field;
+       char *name, *value;
+
+       if (type != TD_TYPE_VHD) {
+               fprintf(stderr, "Cannot set fields of %s images\n",
+                       td_disk_types[type]);
+               return EINVAL;
+       }
+
+       /* every option, -h included, ends in the usage text */
+       while ((opt = getopt(argc, argv, "h")) != -1) {
+               if (opt != 'h')
+                       fprintf(stderr, "Unknown option %c\n", (char)opt);
+               goto usage;
+       }
+
+       if (optind != argc - 3)
+               goto usage;
+
+       name = argv[optind];
+       optind++;
+
+       field = get_field(argv[optind]);
+       if (field == NULL || field->id != TD_FIELD_HIDDEN) {
+               fprintf(stderr, "Invalid field %s\n", argv[optind]);
+               goto usage;
+       }
+
+       optind++;
+       value = argv[optind];
+
+       {
+               char *args[7] = {
+                       "set", "-n", name, "-f", field->name, "-v", value,
+               };
+
+               return vhd_util_set_field(sizeof(args) / sizeof(args[0]),
+                                         args);
+       }
+
+ usage:
+       fprintf(stderr, "usage: td-util set %s [-h help] "
+               "<FILENAME> <FIELD> <VALUE>\n", td_disk_types[type]);
+       print_field_names();
+       return EINVAL;
+}
+
+/*
+ * Entry point: "td-util COMMAND [TYPE] [OPTIONS]".
+ *
+ * Resolves COMMAND through get_command() and, when the command needs one,
+ * TYPE through get_driver_type(). The handler then receives a fresh
+ * argument vector in which argv[0] is the command name and the remaining
+ * entries are the arguments that followed COMMAND (and TYPE).
+ *
+ * The process exit status is the absolute value of the handler's result.
+ * Invalid COMMAND or TYPE arguments terminate the process with exit(-1).
+ */
+int
+main(int argc, char *argv[])
+{
+       char **cargv;
+       struct command *cmd;
+       int cargc, i, type = -1, ret = 0;
+
+#ifdef CORE_DUMP
+       struct rlimit rlim;
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+       if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+               fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+
+       if (argc < 2)
+               help();         /* does not return */
+
+       cargc = argc - 1;
+       cmd   = get_command(argv[1]);
+       if (!cmd) {
+               fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+               help();         /* does not return */
+       }
+
+       if (cmd->needs_type) {
+               if (argc < 3) {
+                       fprintf(stderr, "td-util %s requires a TYPE\n",
+                               cmd->name);
+                       print_disk_types();
+                       exit(-1);
+               }
+
+               type = get_driver_type(argv[2]);
+               if (type < 0) {
+                       fprintf(stderr, "invalid TYPE '%s'.\n", argv[2]);
+                       print_disk_types();
+                       exit(-1);
+               }
+               --cargc;
+       }
+
+       /*
+        * One extra, zeroed slot keeps the vector NULL-terminated like a
+        * real argv, which code scanning argument vectors may rely on.
+        */
+       cargv = calloc(cargc + 1, sizeof(*cargv));
+       if (!cargv)
+               exit(ENOMEM);
+
+       /* argc - cargc is the number of leading arguments dropped */
+       cargv[0] = cmd->name;
+       for (i = 1; i < cargc; i++)
+               cargv[i] = argv[i + (argc - cargc)];
+
+       switch(cmd->id) {
+       case TD_CMD_CREATE:
+               ret = td_create(type, cargc, cargv);
+               break;
+       case TD_CMD_SNAPSHOT:
+               ret = td_snapshot(type, cargc, cargv);
+               break;
+/*
+       case TD_CMD_COALESCE:
+               ret = td_coalesce(type, cargc, cargv);
+               break;
+*/
+       case TD_CMD_QUERY:
+               ret = td_query(type, cargc, cargv);
+               break;
+/*
+       case TD_CMD_RESIZE:
+               ret = td_resize(type, cargc, cargv);
+               break;
+*/
+       case TD_CMD_SET:
+               ret = td_set_field(type, cargc, cargv);
+               break;
+/*
+       case TD_CMD_REPAIR:
+               ret = td_repair(type, cargc, cargv);
+               break;
+       case TD_CMD_FILL:
+               ret = td_fill(type, cargc, cargv);
+               break;
+       case TD_CMD_READ:
+               ret = td_read(type, cargc, cargv);
+               break;
+*/
+       default:
+       case TD_CMD_INVALID:
+               ret = EINVAL;
+               break;
+       }
+
+       free(cargv);
+
+       /* handlers mix positive and negative error codes; report magnitude */
+       return (ret >= 0 ? ret : -ret);
+}
diff --git a/tools/blktap3/include/libvhd-index.h 
b/tools/blktap3/include/libvhd-index.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/include/libvhd-index.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LIB_VHDI_H_
+#define _LIB_VHDI_H_
+
+#include <inttypes.h>
+#include <uuid/uuid.h>
+
+/* Size of the fixed path buffers embedded in struct vhdi_bat. */
+#define VHD_MAX_NAME_LEN                    1024
+
+typedef struct vhdi_context                 vhdi_context_t;
+typedef struct vhdi_bat                     vhdi_bat_t;
+typedef struct vhdi_block                   vhdi_block_t;
+typedef struct vhdi_entry                   vhdi_entry_t;
+typedef uint32_t                            vhdi_file_id_t;
+typedef struct vhdi_file_ref                vhdi_file_ref_t;
+typedef struct vhdi_file_table              vhdi_file_table_t;
+
+/*
+ * Handle type taken by vhdi_open(), vhdi_close() and the block read, write
+ * and append calls declared below.
+ * NOTE(review): 'spb' presumably means sectors per block -- confirm against
+ * the implementation.
+ */
+struct vhdi_context {
+       int                                 fd;
+       int                                 spb;
+       char                               *name;
+       uint32_t                            vhd_block_size;
+};
+
+/*
+ * Type loaded and written by vhdi_bat_load() / vhdi_bat_write(). The three
+ * path members are fixed-size buffers of VHD_MAX_NAME_LEN bytes, whereas
+ * 'table' is a pointer whose ownership is not visible in this header.
+ */
+struct vhdi_bat {
+       uint32_t                           *table;
+       uint64_t                            vhd_blocks;
+       uint32_t                            vhd_block_size;
+       char                                vhd_path[VHD_MAX_NAME_LEN];
+       char                                index_path[VHD_MAX_NAME_LEN];
+       char                                file_table_path[VHD_MAX_NAME_LEN];
+};
+
+/* One entry of a vhdi_block table: a file id paired with an offset. */
+struct vhdi_entry {
+       vhdi_file_id_t                      file_id;
+       uint32_t                            offset;
+};
+
+/* Counted array of vhdi_entry_t. */
+struct vhdi_block {
+       int                                 entries;
+       vhdi_entry_t                       *table;
+};
+
+/* One entry of a vhdi_file_table: a file id with path, UUID and timestamp. */
+struct vhdi_file_ref {
+       vhdi_file_id_t                      file_id;
+       char                               *path;
+       uuid_t                              vhd_uuid;
+       uint32_t                            vhd_timestamp;
+};
+
+/* Counted array of vhdi_file_ref_t; see vhdi_file_table_free(). */
+struct vhdi_file_table {
+       int                                 entries;
+       vhdi_file_ref_t                    *table;
+};
+
+/*
+ * NOTE(review): the prototypes below leave their parameters unnamed, so the
+ * meaning of each argument and of the int return values has to be taken
+ * from the implementation; naming the parameters here would help callers.
+ */
+void vhdi_entry_in(vhdi_entry_t *);
+
+/* Index file handling; all calls but vhdi_create() take a vhdi_context_t. */
+int vhdi_create(const char *, uint32_t);
+int vhdi_open(vhdi_context_t *, const char *, int);
+void vhdi_close(vhdi_context_t *);
+int vhdi_read_block(vhdi_context_t *, vhdi_block_t *, uint32_t);
+int vhdi_write_block(vhdi_context_t *, vhdi_block_t *, uint32_t);
+int vhdi_append_block(vhdi_context_t *, vhdi_block_t *, uint32_t *);
+
+/* vhdi_bat_t handling; the BAT is addressed by a path string. */
+int vhdi_bat_create(const char *, const char *, const char *, const char *);
+int vhdi_bat_load(const char *, vhdi_bat_t *);
+int vhdi_bat_write(const char *, vhdi_bat_t *);
+
+/* vhdi_file_table_t handling; the table is addressed by a path string. */
+int vhdi_file_table_create(const char *);
+int vhdi_file_table_load(const char *, vhdi_file_table_t *);
+int vhdi_file_table_add(const char *, const char *, vhdi_file_id_t *);
+void vhdi_file_table_free(vhdi_file_table_t *);
+
+#endif
diff --git a/tools/blktap3/include/libvhd-journal.h 
b/tools/blktap3/include/libvhd-journal.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/include/libvhd-journal.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _VHD_JOURNAL_H_
+#define _VHD_JOURNAL_H_
+
+#include <inttypes.h>
+
+#include "libvhd.h"
+
+/*
+ * Bit values 0x01 and 0x02.
+ * NOTE(review): presumably the 'mode' argument of vhd_journal_add_block()
+ * -- confirm against the implementation.
+ */
+#define VHD_JOURNAL_METADATA       0x01
+#define VHD_JOURNAL_DATA           0x02
+
+/*
+ * The header cookie is exactly 8 characters long, i.e. it fills
+ * vhd_journal_header_t.cookie without room for a terminating NUL.
+ */
+#define VHD_JOURNAL_HEADER_COOKIE  "vjournal"
+#define VHD_JOURNAL_ENTRY_COOKIE   0xaaaa12344321aaaaULL
+
+/*
+ * Journal header. The named fields add up to 64 bytes when uuid_t is 16
+ * bytes, and 'pad' brings the total to 512 bytes.
+ * NOTE(review): presumably padded to one 512-byte sector -- confirm before
+ * adding or resizing fields.
+ */
+typedef struct vhd_journal_header {
+       char                       cookie[8];
+       uuid_t                     uuid;
+       uint64_t                   vhd_footer_offset;
+       uint32_t                   journal_data_entries;
+       uint32_t                   journal_metadata_entries;
+       uint64_t                   journal_data_offset;
+       uint64_t                   journal_metadata_offset;
+       uint64_t                   journal_eof;
+       char                       pad[448];
+} vhd_journal_header_t;
+
+/*
+ * Journal handle: the journal's name and file descriptor, a copy of its
+ * header, and the vhd_context_t it is paired with.
+ */
+typedef struct vhd_journal {
+       char                      *jname;
+       int                        jfd;
+       int                        is_block; /* is jfd a block device */
+       vhd_journal_header_t       header;
+       vhd_context_t              vhd;
+} vhd_journal_t;
+
+/*
+ * Journal API. 'file' and 'jfile' are the two path arguments of create and
+ * open; judging by the names, the VHD file and its journal file (TODO
+ * confirm). Return value conventions are not visible in this header.
+ */
+int vhd_journal_create(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_open(vhd_journal_t *, const char *file, const char *jfile);
+int vhd_journal_add_block(vhd_journal_t *, uint32_t block, char mode);
+int vhd_journal_commit(vhd_journal_t *);
+int vhd_journal_revert(vhd_journal_t *);
+int vhd_journal_close(vhd_journal_t *);
+int vhd_journal_remove(vhd_journal_t *);
+
+#endif
diff --git a/tools/blktap3/include/tapdisk-message.h 
b/tools/blktap3/include/tapdisk-message.h
--- a/tools/blktap3/include/tapdisk-message.h
+++ b/tools/blktap3/include/tapdisk-message.h
@@ -30,6 +30,10 @@
 #include <inttypes.h>
 #include <sys/types.h>
 
+/* TODO Why do we have two of them? */
+/* TODO This is quite small since we don't allow paths longer than 256 chars.
+ * If we ever increase this, make sure tapdisk_message_t structures are not
+ * allocated on the stack. */
 #define TAPDISK_MESSAGE_MAX_PATH_LENGTH  256
 #define TAPDISK_MESSAGE_STRING_LENGTH    256
 
@@ -145,6 +149,31 @@ struct tapdisk_message_blkif {
     char params[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
 };
 
+/**
+ * Contains parameters for resuming a previously paused VBD.
+ */
+typedef struct tapdisk_message_resume {
+    /**
+     * TODO: document which flags are meaningful for a resume request.
+     */
+       tapdisk_message_flag_t flags;
+
+    /**
+     * The VDI (type:/path/to/file) to resume (presumably; was "to pause").
+     */
+    char params1[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
+
+    /**
+     * A new VDI to use instead of the old one. Optional.
+     */
+    char params2[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
+
+    /**
+     * TODO: purpose undocumented; same size limit as the params buffers.
+     */
+    char secondary[TAPDISK_MESSAGE_MAX_PATH_LENGTH];
+} tapdisk_message_resume_t;
+
 struct tapdisk_message {
     /**
      * TAPDISK_MESSAGE_???
@@ -161,6 +190,7 @@ struct tapdisk_message {
                tapdisk_message_list_t   list;
                tapdisk_message_stat_t   info;
                tapdisk_message_blkif_t  blkif;
+        tapdisk_message_resume_t resume;
        } u;
 };
 
diff --git a/tools/blktap3/include/vhd-util.h b/tools/blktap3/include/vhd-util.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/include/vhd-util.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _VHD_UTIL_H_
+#define _VHD_UTIL_H_
+
+/*
+ * Entry points implemented elsewhere. Each one takes an argc/argv pair and
+ * returns an int.
+ * NOTE(review): presumably one function per vhd-util sub-command, returning 0
+ * on success -- confirm against the implementations.
+ */
+int vhd_util_create(int argc, char **argv);
+int vhd_util_snapshot(int argc, char **argv);
+int vhd_util_query(int argc, char **argv);
+int vhd_util_read(int argc, char **argv);
+int vhd_util_set_field(int argc, char **argv);
+int vhd_util_repair(int argc, char **argv);
+int vhd_util_fill(int argc, char **argv);
+int vhd_util_resize(int argc, char **argv);
+int vhd_util_coalesce(int argc, char **argv);
+int vhd_util_modify(int argc, char **argv);
+int vhd_util_scan(int argc, char **argv);
+int vhd_util_check(int argc, char **argv);
+int vhd_util_revert(int argc, char **argv);
+
+#endif
diff --git a/tools/blktap3/lvm/Makefile b/tools/blktap3/lvm/Makefile
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/lvm/Makefile
@@ -0,0 +1,45 @@
+XEN_ROOT = $(CURDIR)/../../..
+BLKTAP_ROOT := ..
+include $(XEN_ROOT)/tools/Rules.mk
+
+# The stand-alone lvm-util test program is built only on request
+# (make LVM_UTIL_TEST=y).
+ifeq ($(LVM_UTIL_TEST),y)
+TEST              := lvm-util
+endif
+
+override CFLAGS += \
+    -I$(BLKTAP_ROOT)/include \
+    -D_GNU_SOURCE \
+    -Wall \
+    -Wextra \
+    -Werror
+
+# FIXME These warnings are currently triggered by the code; fix the code and
+# drop the exemptions.
+override CFLAGS += \
+    -Wno-sign-compare
+
+# FIXME Why only on 64-bit?
+ifeq ($(CONFIG_X86_64),y)
+override CFLAGS   += -fPIC
+endif
+
+LVM-OBJS          := lvm-util.o
+
+all: build liblvm.a
+
+build: $(TEST) $(LVM-OBJS)
+
+# FIXME lvm-util not installed somewhere
+install: all
+
+# lvm-util.c is compiled a second time with -DLVM_UTIL so that its main() is
+# included. $(CFLAGS) must be passed here as well: without -D_GNU_SOURCE
+# asprintf() has no prototype.
+lvm-util: lvm-util.c lvm-util.h
+       $(CC) $(CFLAGS) -DLVM_UTIL $(LDFLAGS) -o $@ $<
+
+liblvm.a: $(LVM-OBJS)
+       $(AR) rc $@ $^
+
+# Remove everything this Makefile can produce, including the archive and the
+# optional test binary.
+clean:
+       rm -rf *.o *.opic *~ $(DEPS) liblvm.a lvm-util
+
+.PHONY: all build clean install
+
+-include $(DEPS)
diff --git a/tools/blktap3/lvm/lvm-util.c b/tools/blktap3/lvm/lvm-util.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/lvm/lvm-util.c
@@ -0,0 +1,375 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syslog.h>
+
+#include "lvm-util.h"
+
+/*
+ * Log a printf-style message through syslog() at LOG_INFO priority, prefixed
+ * with the name of the calling function. The format must be a string literal
+ * because it is concatenated with the "%s: " prefix. Relies on the GNU
+ * named-variadic-argument and ", ##" macro extensions.
+ */
+#define EPRINTF(_f, _a...)                                     \
+       do {                                                    \
+               syslog(LOG_INFO, "%s: " _f, __func__, ##_a);    \
+       } while (0)
+
+/* scanf conversion for a name field: at most 255 non-blank characters. */
+#define _NAME "%255s"
+/* Scratch buffer shared by the line readers below; holds the current line. */
+static char line[1024];
+
+/*
+ * Read the rest of the current line from @scan (up to 1023 characters, the
+ * newline itself is left unread) into the global line buffer, which is
+ * zeroed first.
+ *
+ * Returns 0 if at least one character was stored, non-zero otherwise (empty
+ * line, read error or end of input).
+ */
+static inline int
+lvm_read_line(FILE *scan)
+{
+       int matched;
+
+       memset(line, 0, sizeof(line));
+       matched = fscanf(scan, "%1023[^\n]", line);
+
+       return matched != 1;
+}
+
+/*
+ * Consume a run of newline characters (up to 1023 of them) from @scan. The
+ * global line buffer is used as scratch space and is clobbered.
+ *
+ * Returns 0 if at least one newline was consumed, non-zero otherwise.
+ */
+static inline int
+lvm_next_line(FILE *scan)
+{
+       int matched = fscanf(scan, "%1023[\n]", line);
+
+       return matched != 1;
+}
+
+/*
+ * Copy the NUL-terminated string @src into @dst.
+ *
+ * @size is the length limit: a source of @size characters or more is
+ * rejected with -ENAMETOOLONG and @dst is left untouched. On success 0 is
+ * returned and @dst holds fewer than @size characters plus the terminator.
+ */
+static int
+lvm_copy_name(char *dst, const char *src, size_t size)
+{
+       size_t len = strnlen(src, size);
+
+       if (len >= size)
+               return -ENAMETOOLONG;
+
+       /* len < size, so src[len] is the terminator: copy it along */
+       memcpy(dst, src, len + 1);
+       return 0;
+}
+
+/*
+ * Record the physical volume @name, whose pe_start value is @start, in the
+ * first free slot of vg->pvs.
+ *
+ * The array is allocated on the first call with room for @pvs entries and
+ * that capacity is remembered in vg->pv_cnt. Every later call must pass the
+ * same @pvs; otherwise the loop below could walk past the end of the
+ * allocation.
+ *
+ * Returns 0 on success, or a negative errno value:
+ *   -EINVAL        @pvs is not positive or differs from the first call
+ *   -ENOMEM        allocation failed, or all @pvs slots are already in use
+ *   -EEXIST        a PV of that name has already been recorded
+ *   -ENAMETOOLONG  @name does not fit in pv->name
+ */
+static int
+lvm_parse_pv(struct vg *vg, const char *name, int pvs, uint64_t start)
+{
+       int i, err;
+       struct pv *pv;
+
+       if (pvs <= 0)
+               return -EINVAL;
+
+       if (!vg->pvs) {
+               vg->pvs = calloc(pvs, sizeof(struct pv));
+               if (!vg->pvs)
+                       return -ENOMEM;
+               vg->pv_cnt = pvs;
+       } else if (pvs != vg->pv_cnt)
+               return -EINVAL;
+
+       pv = NULL;
+       for (i = 0; i < pvs; i++) {
+               pv = vg->pvs + i;
+
+               /* an empty name marks the first unused slot */
+               if (!pv->name[0])
+                       break;
+
+               if (!strcmp(pv->name, name))
+                       return -EEXIST;
+       }
+
+       if (i == pvs)
+               return -ENOMEM;
+
+       err = lvm_copy_name(pv->name, name, sizeof(pv->name) - 1);
+       if (err)
+               return err;
+
+       pv->start = start;
+       return 0;
+}
+
+/*
+ * Run vgs(8) on @vgname and fill in @vg from its output: name, extent size,
+ * PV and LV counts, and one struct pv per output row. vg->lvs is allocated
+ * (zeroed) with room for the reported number of LVs but not populated.
+ *
+ * Returns 0 on success or a negative errno value. On failure @vg is released
+ * with lvm_free_vg() and left zeroed.
+ *
+ * NOTE(review): @vgname is pasted into a shell command line without any
+ * escaping; callers must not pass untrusted names.
+ */
+static int
+lvm_open_vg(const char *vgname, struct vg *vg)
+{
+       FILE *scan;
+       int i, err, pvs, lvs;
+       char *cmd, pvname[256];
+       uint64_t size, pv_start;
+
+       memset(vg, 0, sizeof(*vg));
+
+       /* stay well-defined if the command prints nothing at all */
+       pvs  = 0;
+       lvs  = 0;
+       size = 0;
+
+       err = asprintf(&cmd, "/usr/sbin/vgs %s --noheadings --nosuffix "
+                      "--units=b "
+                      "--options=vg_name,vg_extent_size,lv_count,pv_count,"
+                      "pv_name,pe_start --unbuffered 2> /dev/null", vgname);
+       if (err == -1)
+               return -ENOMEM;
+
+       errno = 0;
+       scan  = popen(cmd, "r");
+       if (!scan) {
+               /* always hand back a negative errno value */
+               err = (errno ? -errno : -ENOMEM);
+               goto out;
+       }
+
+       /* one row per physical volume */
+       for (;;) {
+               if (lvm_read_line(scan))
+                       break;
+
+               err = -EINVAL;
+               if (sscanf(line, _NAME" %"SCNu64" %d %d "_NAME" %"SCNu64,
+                          vg->name, &size, &lvs, &pvs, pvname,
+                          &pv_start) != 6) {
+                       EPRINTF("sscanf failed on '%s'\n", line);
+                       goto out;
+               }
+
+               if (strcmp(vg->name, vgname)) {
+                       EPRINTF("VG name '%s' != '%s'\n", vg->name, vgname);
+                       goto out;
+               }
+               err = lvm_parse_pv(vg, pvname, pvs, pv_start);
+               if (err)
+                       goto out;
+
+               if (lvm_next_line(scan))
+                       break;
+       }
+
+       /* also catches the "no output at all" case: vg->name is still empty */
+       err = -EINVAL;
+       if (strcmp(vg->name, vgname)) {
+               EPRINTF("VG name '%s' != '%s'\n", vg->name, vgname);
+               goto out;
+       }
+
+       if (lvs < 0) {
+               EPRINTF("invalid lv count %d\n", lvs);
+               goto out;
+       }
+
+       /* every PV announced by pv_count must have been seen */
+       for (i = 0; i < pvs; i++)
+               if (!vg->pvs[i].name[0]) {
+                       EPRINTF("pvs %d name empty\n", i);
+                       goto out;
+               }
+
+       err = -ENOMEM;
+       vg->lvs = calloc(lvs, sizeof(struct lv));
+       if (!vg->lvs)
+               goto out;
+
+       err             = 0;
+       vg->lv_cnt      = lvs;
+       vg->pv_cnt      = pvs;
+       vg->extent_size = size;
+
+out:
+       if (scan)
+               pclose(scan);
+       if (err)
+               lvm_free_vg(vg);
+       free(cmd);
+       return err;
+}
+
+/*
+ * Parse the "devices" column of an lvs(8) row into @seg.
+ *
+ * The characters ',', '(' and ')' in @devices are overwritten with blanks
+ * (the string is modified in place); the first two fields are then read as a
+ * device name and a number. The name is stored in seg->device and looked up
+ * among the VG's physical volumes; seg->pe_start is set to
+ * number * vg->extent_size + that PV's start.
+ *
+ * Returns 0 on success, -EINVAL if the string cannot be parsed or the device
+ * is not one of the VG's PVs.
+ */
+static int
+lvm_parse_lv_devices(struct vg *vg, struct lv_segment *seg, char *devices)
+{
+       int i, found;
+       char *c;
+       uint64_t start, pe_start;
+
+       for (c = devices; *c; c++)
+               if (strchr(",()", *c))
+                       *c = ' ';
+
+       if (sscanf(devices, _NAME" %"SCNu64, seg->device, &start) != 2) {
+               EPRINTF("sscanf failed on '%s'\n", devices);
+               return -EINVAL;
+       }
+
+       /* explicit flag rather than a -1 sentinel in an unsigned variable */
+       found    = 0;
+       pe_start = 0;
+       for (i = 0; i < vg->pv_cnt; i++)
+               if (!strcmp(vg->pvs[i].name, seg->device)) {
+                       pe_start = vg->pvs[i].start;
+                       found    = 1;
+                       break;
+               }
+
+       if (!found) {
+               EPRINTF("invalid pe_start value\n");
+               return -EINVAL;
+       }
+
+       seg->pe_start = (start * vg->extent_size) + pe_start;
+       return 0;
+}
+
+/*
+ * Run lvs(8) on vg->name and fill vg->lvs, one entry per logical volume.
+ *
+ * Only rows whose seg_start is 0 (an LV's first segment) create an entry;
+ * other rows are skipped. At most vg->lv_cnt entries are stored; if the
+ * output ends early, vg->lv_cnt is lowered to the number actually read.
+ *
+ * Returns 0 on success or a negative errno value. A row that is not followed
+ * by a newline is treated as an error (-EINVAL). vg->lvs is not released
+ * here on failure; that is left to the caller.
+ *
+ * NOTE(review): vg->name is pasted into a shell command line unescaped.
+ */
+static int
+lvm_scan_lvs(struct vg *vg)
+{
+       char *cmd;
+       FILE *scan;
+       int i, err;
+
+       err = asprintf(&cmd, "/usr/sbin/lvs %s --noheadings --nosuffix "
+                      "--units=b "
+                      "--options=lv_name,lv_size,segtype,seg_count,seg_start,"
+                      "seg_size,devices --unbuffered 2> /dev/null", vg->name);
+       if (err == -1)
+               return -ENOMEM;
+
+       errno = 0;
+       scan  = popen(cmd, "r");
+       if (!scan) {
+               err = (errno ? -errno : -ENOMEM);
+               goto out;
+       }
+
+       for (i = 0;;) {
+               unsigned int segs;      /* matches the %u conversion below */
+               struct lv *lv;
+               struct lv_segment seg;
+               unsigned long long size, seg_start;
+               char type[32], name[256], devices[1024];
+
+               if (i >= vg->lv_cnt)
+                       break;
+
+               if (lvm_read_line(scan)) {
+                       vg->lv_cnt = i;
+                       break;
+               }
+
+               err = -EINVAL;
+               lv  = vg->lvs + i;
+
+               if (sscanf(line, _NAME" %llu %31s %u %llu %"SCNu64" %1023s",
+                          name, &size, type, &segs, &seg_start,
+                          &seg.pe_size, devices) != 7) {
+                       EPRINTF("sscanf failed on '%s'\n", line);
+                       goto out;
+               }
+
+               /* not the first segment of its LV: nothing to record */
+               if (seg_start)
+                       goto next;
+
+               if (!strcmp(type, "linear"))
+                       seg.type = LVM_SEG_TYPE_LINEAR;
+               else
+                       seg.type = LVM_SEG_TYPE_UNKNOWN;
+
+               if (lvm_parse_lv_devices(vg, &seg, devices))
+                       goto out;
+
+               i++;
+               lv->size          = size;
+               lv->segments      = segs;
+               lv->first_segment = seg;
+
+               err = lvm_copy_name(lv->name, name, sizeof(lv->name) - 1);
+               if (err)
+                       goto out;
+               err = -EINVAL;
+
+       next:
+               /* err is -EINVAL on both paths that reach this label */
+               if (lvm_next_line(scan)) {
+                       if (err)
+                               EPRINTF("fscanf failed\n");
+                       goto out;
+               }
+       }
+
+       err = 0;
+
+out:
+       if (scan)
+               pclose(scan);
+       free(cmd);
+       return err;
+}
+
+/*
+ * Release the PV and LV arrays owned by @vg and reset the whole structure to
+ * zero. Safe to call on an already zeroed @vg (free(NULL) is a no-op).
+ */
+void
+lvm_free_vg(struct vg *vg)
+{
+       struct pv *pvs = vg->pvs;
+       struct lv *lvs = vg->lvs;
+
+       memset(vg, 0, sizeof(*vg));
+
+       free(pvs);
+       free(lvs);
+}
+
+/*
+ * Populate @vg with the description of volume group @vg_name: first the VG
+ * and its PVs (lvm_open_vg), then its LVs (lvm_scan_lvs).
+ *
+ * Returns 0 on success; the caller then owns @vg and must release it with
+ * lvm_free_vg(). On failure the error of the failing step is returned and
+ * @vg has already been released.
+ */
+int
+lvm_scan_vg(const char *vg_name, struct vg *vg)
+{
+       int err;
+
+       memset(vg, 0, sizeof(*vg));
+
+       err = lvm_open_vg(vg_name, vg);
+       if (!err) {
+               err = lvm_scan_lvs(vg);
+               if (err)
+                       lvm_free_vg(vg);
+       }
+
+       return err;
+}
+
+#ifdef LVM_UTIL
+/* Print the command-line synopsis and terminate with exit status EINVAL. */
+static int
+usage(void)
+{
+       fputs("usage: lvm-util <vgname>\n", stdout);
+       exit(EINVAL);
+}
+
+/*
+ * Test driver: scan the volume group named by argv[1] and print its PVs and
+ * LVs. Exits with 0 on success, or with the (positive) error code of the
+ * failed scan.
+ */
+int
+main(int argc, char **argv)
+{
+       int n, err;
+       struct vg vg;
+
+       if (argc != 2)
+               usage();
+
+       err = lvm_scan_vg(argv[1], &vg);
+       if (err) {
+               printf("scan failed: %d\n", err);
+               return (err < 0 ? -err : err);
+       }
+
+       printf("vg %s: extent_size: %"PRIu64", pvs: %d, lvs: %d\n",
+              vg.name, vg.extent_size, vg.pv_cnt, vg.lv_cnt);
+
+       for (n = 0; n < vg.pv_cnt; n++) {
+               struct pv *pv = &vg.pvs[n];
+
+               printf("pv %s: start %"PRIu64"\n", pv->name, pv->start);
+       }
+
+       for (n = 0; n < vg.lv_cnt; n++) {
+               struct lv *lv          = &vg.lvs[n];
+               struct lv_segment *seg = &lv->first_segment;
+
+               printf("lv %s: size: %"PRIu64", segments: %u, type: %u, "
+                      "dev: %s, pe_start: %"PRIu64", pe_size: %"PRIu64"\n",
+                      lv->name, lv->size, lv->segments, seg->type,
+                      seg->device, seg->pe_start, seg->pe_size);
+       }
+
+       lvm_free_vg(&vg);
+       return 0;
+}
+#endif
diff --git a/tools/blktap3/lvm/lvm-util.h b/tools/blktap3/lvm/lvm-util.h
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/lvm/lvm-util.h
@@ -0,0 +1,71 @@
+/* 
+ * Copyright (c) 2008, XenSource Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _LVM_UTIL_H_
+#define _LVM_UTIL_H_
+
+#include <inttypes.h>
+
+/* Size, in bytes, of every name buffer in the structures below. */
+#define MAX_NAME_SIZE            256
+
+/* Values of lv_segment.type. */
+#define LVM_SEG_TYPE_LINEAR      1
+#define LVM_SEG_TYPE_UNKNOWN     2
+
+/* One segment of a logical volume. */
+struct lv_segment {
+       /* LVM_SEG_TYPE_LINEAR or LVM_SEG_TYPE_UNKNOWN */
+       uint8_t                  type;
+       /* name of the physical volume the segment lives on */
+       char                     device[MAX_NAME_SIZE];
+       /* segment start; presumably a byte offset into device -- confirm */
+       uint64_t                 pe_start;
+       /* segment size; presumably in bytes -- confirm */
+       uint64_t                 pe_size;
+};
+
+/* A logical volume. Only its first segment is described in detail. */
+struct lv {
+       char                     name[MAX_NAME_SIZE];
+       /* size of the volume; presumably in bytes -- confirm */
+       uint64_t                 size;
+       /* number of segments the volume consists of */
+       uint32_t                 segments;
+       struct lv_segment        first_segment;
+};
+
+/* A physical volume belonging to a volume group. */
+struct pv {
+       char                     name[MAX_NAME_SIZE];
+       /* where the first extent starts; presumably a byte offset -- confirm */
+       uint64_t                 start;
+};
+
+/*
+ * A volume group together with its physical and logical volumes. Filled in
+ * by lvm_scan_vg(); the two arrays are released by lvm_free_vg().
+ */
+struct vg {
+       char                     name[MAX_NAME_SIZE];
+       /* extent size; presumably in bytes -- confirm */
+       uint64_t                 extent_size;
+
+       /* number of entries in pvs */
+       int                      pv_cnt;
+       struct pv               *pvs;
+
+       /* number of entries in lvs */
+       int                      lv_cnt;
+       struct lv               *lvs;
+};
+
+/*
+ * Fill @vg with the description of volume group @vg_name. Returns 0 on
+ * success, non-zero on failure. After a successful call the caller must
+ * release @vg with lvm_free_vg().
+ */
+int lvm_scan_vg(const char *vg_name, struct vg *vg);
+/* Release the arrays owned by @vg and zero the structure. */
+void lvm_free_vg(struct vg *vg);
+
+#endif
diff --git a/tools/blktap3/part/Makefile b/tools/blktap3/part/Makefile
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/part/Makefile
@@ -0,0 +1,34 @@
+XEN_ROOT := $(CURDIR)/../../../
+include $(XEN_ROOT)/tools/Rules.mk
+
+BLKTAP_ROOT := ..
+
+IBIN = part-util
+
+override CFLAGS += \
+       -I$(BLKTAP_ROOT)/include \
+       $(CFLAGS_xeninclude) \
+    -Wall \
+    -Wextra \
+    -Werror
+
+# FIXME These warnings are currently triggered by the code; fix the code and
+# drop the exemptions.
+override CFLAGS += \
+    -Wno-sign-compare
+
+PART-OBJS := partition.o
+
+# vhdpartx is a script shipped as-is; listing it here only checks it exists.
+all: $(IBIN) vhdpartx
+
+$(IBIN): $(PART-OBJS) part-util.o
+       $(CC) -o $@ $^ $(LDFLAGS)
+
+install: all
+       $(INSTALL_DIR) -p $(DESTDIR)$(BINDIR)
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(BINDIR)
+       $(INSTALL_PROG) vhdpartx $(DESTDIR)$(BINDIR)
+
+clean:
+       rm -f *.o .*.o.d $(IBIN)
+
+# None of these names a file; keep them working even if such a file appears.
+.PHONY: all install clean
diff --git a/tools/blktap3/part/part-util.c b/tools/blktap3/part/part-util.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/part/part-util.c
@@ -0,0 +1,369 @@
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <time.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <endian.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <byteswap.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <linux/hdreg.h>
+
+#include "partition.h"
+
+/*
+ * Host to little-endian conversions: no-ops on little-endian hosts, byte
+ * swaps otherwise.
+ */
+#if BYTE_ORDER == LITTLE_ENDIAN
+  #define cpu_to_le32(x) (x)
+  #define cpu_to_le64(x) (x)
+#else
+  #define cpu_to_le32(x) bswap_32(x)
+  #define cpu_to_le64(x) bswap_64(x)
+#endif
+
+/* Print the command-line synopsis, using @app as the program name. */
+static void
+usage(const char *app)
+{
+       static const char synopsis[] =
+               "usage: %s <-i image> "
+               "[-d dump] [-c count] [-f format] "
+               "[-t type] [-s sig <part>]\n";
+
+       printf(synopsis, app);
+}
+
+/*
+ * Split the three packed bytes of @c into head, sector and cylinder:
+ * byte 0 is the head, the low 6 bits of byte 1 are the sector, and the top
+ * 2 bits of byte 1 together with byte 2 form the 10-bit cylinder.
+ */
+static void
+chs_unpack(struct partition_chs *c,
+          uint8_t *head, uint8_t *sector, uint16_t *cylinder)
+{
+       *head     = c->chs[0];
+       *sector   = c->chs[1] & 0x3f;
+       *cylinder = ((c->chs[1] & 0xc0) << 2) + c->chs[2];
+}
+
+/*
+ * Print the disk and MBR signatures of @pt followed by one record for each
+ * of the four primary partition slots (status, first and last CHS, type,
+ * start LBA and block count) on standard output.
+ */
+void
+partition_table_dump(struct partition_table *pt)
+{
+       int idx;
+
+       printf("disk signature   0x%08x\n", pt->disk_signature);
+       printf("mbr signature    0x%04x\n", pt->mbr_signature);
+       printf("\n");
+
+       for (idx = 0; idx < 4; idx++) {
+               struct primary_partition *part = &pt->partitions[idx];
+               uint8_t h, s;
+               uint16_t cyl;
+
+               printf("  %d status       0x%02x\n", idx, part->status);
+
+               /* first sector of the partition */
+               chs_unpack(&part->chs_first, &h, &s, &cyl);
+               printf("  %d s cylinder   0x%04x\n", idx, cyl);
+               printf("  %d s sector     0x%01x\n", idx, s);
+               printf("  %d s head       0x%01x\n", idx, h);
+
+               printf("  %d type         0x%01x\n", idx, part->type);
+
+               /* last sector of the partition */
+               chs_unpack(&part->chs_last, &h, &s, &cyl);
+               printf("  %d e cylinder   0x%04x\n", idx, cyl);
+               printf("  %d e sector     0x%01x\n", idx, s);
+               printf("  %d e head       0x%01x\n", idx, h);
+
+               printf("  %d lba          0x%08x\n", idx, part->lba);
+               printf("  %d blocks       0x%08x\n", idx, part->blocks);
+
+               printf("\n");
+       }
+}
+
+/*
+ * Read the partition table at the start of @image and print it.
+ *
+ * Returns 0 on success. On failure returns 1 with errno describing the
+ * cause: the open()/read() error, EIO for a short read, or EINVAL if the
+ * table does not validate.
+ */
+static int
+dump_partitions(const char *image)
+{
+       int fd, ret, saved;
+       ssize_t n;
+       struct partition_table pt;
+
+       ret = 1;
+
+       fd = open(image, O_RDONLY);
+       if (fd == -1)
+               return ret;     /* nothing to close; errno is open()'s */
+
+       n = read(fd, &pt, sizeof(pt));
+       if (n != (ssize_t)sizeof(pt)) {
+               /* a short read sets no errno of its own */
+               if (n >= 0)
+                       errno = EIO;
+               goto out;
+       }
+
+       partition_table_in(&pt);
+       if (partition_table_validate(&pt)) {
+               printf("table invalid\n");
+               errno = EINVAL; /* set last: printf() may touch errno */
+               goto out;
+       }
+
+       partition_table_dump(&pt);
+       ret = 0;
+
+out:
+       /* keep the error being reported from being replaced by close()'s */
+       saved = errno;
+       close(fd);
+       errno = saved;
+       return ret;
+}
+
+/*
+ * Print, as one line of hex digits, the disk signature of @pt followed by
+ * the byte offset (LBA * 512) of partition @part, both in little-endian
+ * byte order. @part counts from 1; for values outside 1..4 nothing is
+ * printed and errno is set to EINVAL.
+ */
+static void
+__dump_signature(struct partition_table *pt, int part)
+{
+       size_t n;
+       uint32_t sig;
+       uint64_t off;
+       const uint8_t *bytes;
+
+       if (part < 1 || part > 4) {
+               errno = EINVAL;
+               return;
+       }
+
+       sig = pt->disk_signature;
+       sig = cpu_to_le32(sig);
+
+       off = (uint64_t)pt->partitions[part - 1].lba << 9;
+       off = cpu_to_le64(off);
+
+       bytes = (const uint8_t *)&sig;
+       for (n = 0; n < sizeof(sig); n++)
+               printf("%02x", bytes[n]);
+
+       bytes = (const uint8_t *)&off;
+       for (n = 0; n < sizeof(off); n++)
+               printf("%02x", bytes[n]);
+
+       printf("\n");
+}
+
+/*
+ * Read the partition table at the start of @image and print the signature
+ * line for partition @part (1..4).
+ *
+ * Returns 0 on success. On failure returns 1 with errno describing the
+ * cause: EINVAL for a partition number outside 1..4 or a table that does
+ * not validate, the open()/read() error, or EIO for a short read.
+ */
+static int
+dump_signature(const char *image, int part)
+{
+       int fd, ret, saved;
+       ssize_t n;
+       struct partition_table pt;
+
+       ret = 1;
+
+       /* __dump_signature() cannot report this; reject it up front */
+       if (part < 1 || part > 4) {
+               errno = EINVAL;
+               return ret;
+       }
+
+       fd = open(image, O_RDONLY);
+       if (fd == -1)
+               return ret;     /* nothing to close; errno is open()'s */
+
+       n = read(fd, &pt, sizeof(pt));
+       if (n != (ssize_t)sizeof(pt)) {
+               /* a short read sets no errno of its own */
+               if (n >= 0)
+                       errno = EIO;
+               goto out;
+       }
+
+       partition_table_in(&pt);
+       if (partition_table_validate(&pt)) {
+               printf("table invalid\n");
+               errno = EINVAL; /* set last: printf() may touch errno */
+               goto out;
+       }
+
+       __dump_signature(&pt, part);
+       ret = 0;
+
+out:
+       /* keep the error being reported from being replaced by close()'s */
+       saved = errno;
+       close(fd);
+       errno = saved;
+       return ret;
+}
+
+/*
+ * Count the primary partition slots of @image whose type is non-zero and
+ * store the result in *@count. An image whose table does not validate
+ * counts as having 0 partitions; that is not an error.
+ *
+ * Returns 0 on success. On failure returns 1 with errno set to the
+ * open()/read() error, or EIO for a short read; *@count is then untouched.
+ */
+static int
+count_partitions(const char *image, int *count)
+{
+       int i, fd, ret, saved;
+       ssize_t n;
+       struct partition_table pt;
+
+       ret = 1;
+
+       fd = open(image, O_RDONLY);
+       if (fd == -1)
+               return ret;     /* nothing to close; errno is open()'s */
+
+       n = read(fd, &pt, sizeof(pt));
+       if (n != (ssize_t)sizeof(pt)) {
+               /* a short read sets no errno of its own */
+               if (n >= 0)
+                       errno = EIO;
+               goto out;
+       }
+
+       *count = 0;
+
+       partition_table_in(&pt);
+       if (!partition_table_validate(&pt))
+               for (i = 0; i < 4; i++)
+                       if (pt.partitions[i].type)
+                               (*count)++;
+
+       ret = 0;
+out:
+       /* keep the error being reported from being replaced by close()'s */
+       saved = errno;
+       close(fd);
+       errno = saved;
+       return ret;
+}
+
+/*
+ * Write a fresh partition table to the start of block device @image. It
+ * holds a single bootable primary partition of type @type that starts at
+ * sector geo.sectors (the second track) and extends to the end of the last
+ * full cylinder, plus a random disk signature.
+ *
+ * @pt receives the table that was written. Note that it has already been
+ * passed through partition_table_out(), i.e. it is in on-disk byte order.
+ *
+ * Returns 0 on success. On failure returns 1 with errno describing the
+ * cause: the open()/ioctl()/write() error, EINVAL if the device reports an
+ * unusable geometry or is smaller than one cylinder, EIO for a short write.
+ */
+static int
+format_partition(const char *image, int type, struct partition_table *pt)
+{
+       uint64_t lend;
+       uint32_t start, end;
+       int ret, sec_size, fd, saved;
+       ssize_t n;
+       unsigned int cylinders;
+       struct hd_geometry geo;
+       struct primary_partition *pp;
+       struct partition_geometry pgeo;
+       unsigned long long bytes, llcyls;
+
+       ret = 1;
+
+       memset(pt, 0, sizeof(*pt));
+       pp = pt->partitions;
+
+       srandom(time(NULL));
+
+       fd = open(image, O_RDWR);
+       if (fd == -1)
+               return ret;     /* nothing to close; errno is open()'s */
+
+       if (ioctl(fd, HDIO_GETGEO, &geo))
+               goto out;
+
+       if (ioctl(fd, BLKGETSIZE64, &bytes))
+               goto out;
+
+       if (ioctl(fd, BLKSSZGET, &sec_size))
+               goto out;
+
+       /* any of these being zero would make the division below trap */
+       if (sec_size < 512 || !geo.heads || !geo.sectors) {
+               errno = EINVAL;
+               goto out;
+       }
+
+       llcyls = (bytes >> 9) /
+               ((unsigned long long)(sec_size >> 9) *
+                geo.heads * geo.sectors);
+       if (!llcyls) {
+               /* device too small: lend below would wrap around */
+               errno = EINVAL;
+               goto out;
+       }
+
+       /* saturate values that do not fit the narrower on-disk fields */
+       cylinders = llcyls;
+       if (cylinders != llcyls)
+               cylinders = ~0;
+
+       pgeo.heads          = geo.heads;
+       pgeo.sectors        = geo.sectors;
+       pgeo.cylinders      = cylinders;
+
+       start               = pgeo.sectors;
+       lend                = geo.heads * geo.sectors * llcyls - 1;
+
+       end = lend;
+       if (end != lend)
+               end = ~0;
+
+       pp->status          = PARTITION_BOOTABLE;
+       pp->type            = type;
+       pp->lba             = start;
+       pp->blocks          = end - start + 1;
+       pp->chs_first       = lba_to_chs(&pgeo, start);
+       pp->chs_last        = lba_to_chs(&pgeo, lend);
+
+       pt->mbr_signature   = MBR_SIGNATURE;
+       pt->disk_signature  = random();
+
+       partition_table_out(pt);
+       n = write(fd, pt, sizeof(*pt));
+       if (n != (ssize_t)sizeof(*pt)) {
+               /* a short write sets no errno of its own */
+               if (n >= 0)
+                       errno = EIO;
+               goto out;
+       }
+
+       ret = 0;
+
+out:
+       /* keep the error being reported from being replaced by close()'s */
+       saved = errno;
+       close(fd);
+       errno = saved;
+       return ret;
+}
+
+/*
+ * part-util entry point.
+ *
+ *   -i image   image or device to operate on (required)
+ *   -f         write a new single-partition table; needs -t
+ *   -t type    partition type for -f, decimal or 0x-prefixed hex
+ *   -c         print the number of partitions
+ *   -s part    print the signature line of partition part (1..4)
+ *   -d         dump the partition table
+ *   -h         print usage
+ *
+ * At least one of -f, -c, -s, -d must be given; the requested actions run in
+ * that order. Returns 0 on success, 1 on any error.
+ */
+int
+main(int argc, char *argv[])
+{
+       char *image;
+       struct partition_table pt;
+       int ret, c, type, count, dump, format, signature;
+
+       ret       = 1;
+       format    = 0;
+       count     = 0;
+       dump      = 0;
+       type      = 0;
+       signature = -1;         /* -1: no -s option given */
+       image     = NULL;
+
+       while ((c = getopt(argc, argv, "i:fdt:cs:h")) != -1) {
+               switch (c) {
+               case 'i':
+                       image = optarg;
+                       break;
+               case 'c':
+                       count = 1;
+                       break;
+               case 's':
+                       signature = atoi(optarg);
+                       if (signature < 1 || signature > 4) {
+                               errno = EINVAL;
+                               perror("partition number");
+                               goto out;
+                       }
+                       break;
+               case 'f':
+                       format = 1;
+                       break;
+               case 't': {
+                       int base = (!strncasecmp(optarg, "0x", 2) ? 16 : 10);
+                       type = strtol(optarg, NULL, base);
+                       break;
+               }
+               case 'd':
+                       dump = 1;
+                       break;
+               case 'h':
+                       usage(argv[0]);
+                       ret = 0;
+                       goto out;
+               default:
+                       /* unknown option; getopt() has already complained */
+                       errno = EINVAL;
+                       usage(argv[0]);
+                       goto out;
+               }
+       }
+
+       /* signature's "unset" value is -1, not 0 */
+       if (!image || (!format && !count && signature == -1 && !dump)) {
+               errno = EINVAL;
+               usage(argv[0]);
+               goto out;
+       }
+
+       if (format) {
+               if (!type) {
+                       errno = EINVAL;
+                       perror("type required");
+                       goto out;
+               }
+
+               if (format_partition(image, type, &pt)) {
+                       perror("formatting partition");
+                       goto out;
+               }
+
+               __dump_signature(&pt, 1);
+       }
+
+       if (count) {
+               /* count is reused: flag on entry, result on return */
+               if (count_partitions(image, &count)) {
+                       perror("counting partitions");
+                       goto out;
+               }
+               printf("%d\n", count);
+       }
+
+       if (signature != -1) {
+               if (dump_signature(image, signature)) {
+                       perror("dumping signature");
+                       goto out;
+               }
+       }
+
+       if (dump) {
+               if (dump_partitions(image)) {
+                       perror("dumping partitions");
+                       goto out;
+               }
+       }
+
+       ret = 0;
+
+out:
+       return ret;
+}
diff --git a/tools/blktap3/part/partition.c b/tools/blktap3/part/partition.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/part/partition.c
@@ -0,0 +1,112 @@
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <endian.h>
+#include <byteswap.h>
+
+#include "partition.h"
+
+/*
+ * Conversions between host and little-endian byte order: no-ops on
+ * little-endian hosts, byte swaps otherwise.
+ */
+#if BYTE_ORDER == LITTLE_ENDIAN
+  #define le16_to_cpu(x) (x)
+  #define le32_to_cpu(x) (x)
+  #define cpu_to_le16(x) (x)
+  #define cpu_to_le32(x) (x)
+#else
+  #define le16_to_cpu(x) bswap_16(x)
+  #define le32_to_cpu(x) bswap_32(x)
+  #define cpu_to_le16(x) bswap_16(x)
+  #define cpu_to_le32(x) bswap_32(x)
+#endif
+
+/* Number of elements in array a. Only valid for true arrays, not pointers. */
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a)[0])
+
+/* Convert the multi-byte fields of @p from little-endian to host order. */
+void
+primary_partition_in(struct primary_partition *p)
+{
+       p->blocks = le32_to_cpu(p->blocks);
+       p->lba    = le32_to_cpu(p->lba);
+}
+
+/* Convert the multi-byte fields of @p from host to little-endian order. */
+void
+primary_partition_out(struct primary_partition *p)
+{
+       p->blocks = cpu_to_le32(p->blocks);
+       p->lba    = cpu_to_le32(p->lba);
+}
+
+/*
+ * Convert a partition table just read from disk to host byte order: both
+ * signatures and every primary partition entry.
+ */
+void
+partition_table_in(struct partition_table *pt)
+{
+       size_t n;
+
+       pt->disk_signature = le32_to_cpu(pt->disk_signature);
+       pt->mbr_signature  = le16_to_cpu(pt->mbr_signature);
+
+       for (n = 0; n < ARRAY_SIZE(pt->partitions); n++)
+               primary_partition_in(&pt->partitions[n]);
+}
+
+/*
+ * Convert a partition table to on-disk (little-endian) byte order: both
+ * signatures and every primary partition entry.
+ */
+void
+partition_table_out(struct partition_table *pt)
+{
+       size_t n;
+
+       pt->disk_signature = cpu_to_le32(pt->disk_signature);
+       pt->mbr_signature  = cpu_to_le16(pt->mbr_signature);
+
+       for (n = 0; n < ARRAY_SIZE(pt->partitions); n++)
+               primary_partition_out(&pt->partitions[n]);
+}
+
+/*
+ * Check a primary partition entry. Returns 0 if its status byte is one of
+ * the two known values, EINVAL (positive) otherwise.
+ */
+int
+primary_partition_validate(struct primary_partition *p)
+{
+       if (p->status == PARTITION_BOOTABLE ||
+           p->status == PARTITION_NON_BOOTABLE)
+               return 0;
+
+       return EINVAL;
+}
+
+/*
+ * Check a partition table that is in host byte order. Returns 0 if the MBR
+ * signature matches and every primary partition entry validates; otherwise
+ * a positive errno value (EINVAL, or the first entry's error).
+ */
+int
+partition_table_validate(struct partition_table *pt)
+{
+       size_t n;
+       int err;
+
+       if (pt->mbr_signature != MBR_SIGNATURE)
+               return EINVAL;
+
+       err = 0;
+       for (n = 0; !err && n < ARRAY_SIZE(pt->partitions); n++)
+               err = primary_partition_validate(&pt->partitions[n]);
+
+       return err;
+}
+
+/*
+ * Pack the sector address @lba into the three-byte CHS form for geometry
+ * @geo: byte 0 holds the head, the low 6 bits of byte 1 the 1-based sector,
+ * and the top 2 bits of byte 1 plus byte 2 the 10-bit cylinder.
+ *
+ * Addresses at or beyond cylinder 0x3ff are clamped to cylinder 0x3ff, the
+ * last head and the last sector.
+ */
+struct partition_chs
+lba_to_chs(struct partition_geometry *geo, uint64_t lba)
+{
+       struct partition_chs c;
+       uint64_t cyl;
+
+       if (lba >= 0x3ff * geo->sectors * geo->heads) {
+               c.chs[0] = geo->heads - 1;
+               c.chs[1] = geo->sectors;
+               cyl      = 0x3ff;
+       } else {
+               uint64_t track = lba / geo->sectors;
+
+               c.chs[1] = lba % geo->sectors + 1;
+               c.chs[0] = track % geo->heads;
+               cyl      = track / geo->heads;
+       }
+
+       c.chs[2]  = cyl & 0xff;
+       c.chs[1] |= (cyl >> 2) & 0xc0;
+
+       return c;
+}
diff --git a/tools/blktap3/part/vhdpartx b/tools/blktap3/part/vhdpartx
new file mode 100755
--- /dev/null
+++ b/tools/blktap3/part/vhdpartx
@@ -0,0 +1,109 @@
+#!/bin/sh
+
+set -e
+
+PARTUTIL=/usr/sbin/part-util
+LIBVHDIO=/usr/lib/libvhdio.so.1.0
+
+die()
+{
+    echo "$@"
+    exit 1
+}
+
+usage()
+{
+    echo "usage: $0 [-a | -d | -l] vhd [lib]"
+    echo "-a add partition mappings"
+    echo "-d del partition mappings"
+    echo "-l list partition mappings"
+    exit 1
+}
+
+parse_args()
+{
+    part_util=$PARTUTIL
+
+    while [ $# -ge 1 ]; do
+       case $1 in
+           -a) add="TRUE" && count="1$count";;
+           -d) del="TRUE" && count="1$count";;
+           -l) list="TRUE" && count="1$count";;
+           *) if [ -z "$vhd" ]; then vhd=$1;
+              elif [ -z "$lib" ]; then lib=$1;
+              else usage;
+              fi;;
+       esac
+       shift
+    done
+
+    [[ -z "$lib" ]] && lib=$LIBVHDIO
+    [[ -z "$vhd" || "$count" != "1" ]] && usage
+    return 0
+}
+
+# screen-scraping of fdisk... not used
+fdisk_read_partitions()
+{
+    local data=$(LD_PRELOAD=$lib fdisk -l $vhd)
+
+    local none=$(echo $data | grep "This doesn't look like a partition table")
+    [[ -n "$none" ]] && partitions=0 && return 0
+
+    partitions=4
+    while [[ "$partitions" != "0" ]]; do
+       local hit=$(echo $data | grep "${vhd}$partitions")
+       [[ -n "$hit" ]] && break
+       let partitions=$partitions-1
+    done
+}
+
+part_util_read_partitions()
+{
+    partitions=$(LD_PRELOAD=$lib $part_util -c -i $vhd)
+}
+
+list_mappings()
+{
+    local parts=1
+    while [[ $parts -le $partitions ]]; do
+       echo ${vhd}$parts
+       let parts=$parts+1
+    done
+}
+
+add_mappings()
+{
+    local parts=1
+    local path=$(realpath $vhd)
+    while [[ $parts -le $partitions ]]; do
+       [[ -e ${path}${parts} ]] || ln -s $(basename $path) ${path}$parts
+       let parts=$parts+1
+    done
+}
+
+del_mappings()
+{
+    local parts=1
+    while [[ $parts -le $partitions ]]; do
+       [[ -L ${vhd}$parts ]] && rm -f ${vhd}$parts
+       let parts=$parts+1
+    done
+}
+
+main()
+{
+    parse_args $@
+    [[ -x $part_util ]] || die "can't find part-util"
+    [[ -r $vhd && -r $lib ]] || die "can't find vhd or lib"
+
+    part_util_read_partitions
+
+    [[ -n "$add" ]] && add_mappings
+    [[ -n "$del" ]] && del_mappings
+    [[ -n "$list" ]] && list_mappings
+
+    return 0
+}
+
+main $@
diff --git a/tools/blktap3/tapback/Makefile b/tools/blktap3/tapback/Makefile
--- a/tools/blktap3/tapback/Makefile
+++ b/tools/blktap3/tapback/Makefile
@@ -3,8 +3,6 @@ include $(XEN_ROOT)/tools/Rules.mk
 
 BLKTAP_ROOT := ..
 
-INST_DIR ?= $(BINDIR)
-
 IBIN = tapback
 
 # -D_GNU_SOURCE is required by vasprintf.
@@ -39,8 +37,8 @@ all: $(IBIN)
        $(CC) -o $@ $^ $(TAPBACK-LIBS) $(LDFLAGS)
 
 install: all
-       $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR)
-       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR)
+       $(INSTALL_DIR) -p $(DESTDIR)$(BINDIR)
+       $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(BINDIR)
 
 clean:
        rm -f *.o *.o.d .*.o.d $(IBIN)
diff --git a/tools/blktap3/vhd/Makefile b/tools/blktap3/vhd/Makefile
--- a/tools/blktap3/vhd/Makefile
+++ b/tools/blktap3/vhd/Makefile
@@ -5,11 +5,40 @@ include $(XEN_ROOT)/tools/Rules.mk
 SUBDIRS-y         :=
 SUBDIRS-y         += lib
 
-all: subdirs-all
+IBIN               = vhd-util3 vhd-index3 vhd-update3
 
-LIBS_DEPENDS := lib/libvhd.so lib/libvhd.a
-$(LIBS_DEPENDS): subdirs-all
+override CFLAGS += \
+       -fno-strict-aliasing \
+       -I$(BLKTAP_ROOT)/include \
+       $(CFLAGS_libxenctrl) \
+       -D_GNU_SOURCE \
+       -DUSE_NFS_LOCKS \
+    -Werror \
+    -Wall \
+    -Wextra
 
+
+ifeq ($(VHD_STATIC),y)
+CFLAGS            += -static
+endif
+
+LIBS              := -Llib -lvhd
+LIBS              += -luuid
+
+all: subdirs-all build
+
+build: $(IBIN)
+
+vhd-util3: vhd-util.o
+       $(CC) $(CFLAGS) -o vhd-util3 vhd-util.o $(LIBS)
+
+vhd-index3: vhd-index.o
+       $(CC) $(CFLAGS) -o vhd-index3 vhd-index.o $(LIBS)
+
+vhd-update3: vhd-update.o
+       $(CC) $(CFLAGS) -o vhd-update3 vhd-update.o $(LIBS)
+
+# FIXME Must install vhd-* binaries
 install: all
        $(MAKE) subdirs-install
 
diff --git a/tools/blktap3/vhd/lib/Makefile b/tools/blktap3/vhd/lib/Makefile
--- a/tools/blktap3/vhd/lib/Makefile
+++ b/tools/blktap3/vhd/lib/Makefile
@@ -13,6 +13,7 @@ INST-DIR         = $(LIBDIR)
 override CFLAGS += \
        -I$(BLKTAP_ROOT)/include \
        -I$(BLKTAP_ROOT)/part \
+    -I$(BLKTAP_ROOT)/lvm \
        -D_GNU_SOURCE \
        -fPIC \
        $(CFLAGS_xeninclude) \
@@ -34,10 +35,29 @@ LIBS            += -liconv
 endif
 
 LIB-SRCS        := libvhd.c
-# TODO Not in Citrix blktap2, import it.
-#LIB-SRCS += vhd-util-uuid.c
 LIB-SRCS        += relative-path.c
 LIB-SRCS        += atomicio.c
+LIB-SRCS        += libvhd-index.c
+LIB-SRCS        += libvhd-journal.c
+LIB-SRCS        += vhd-util-coalesce.c
+LIB-SRCS        += vhd-util-create.c
+LIB-SRCS        += vhd-util-fill.c
+LIB-SRCS        += vhd-util-modify.c
+LIB-SRCS        += vhd-util-query.c
+LIB-SRCS        += vhd-util-read.c
+LIB-SRCS        += vhd-util-repair.c
+LIB-SRCS        += vhd-util-resize.c
+LIB-SRCS        += vhd-util-revert.c
+LIB-SRCS        += vhd-util-set-field.c
+LIB-SRCS        += vhd-util-snapshot.c
+LIB-SRCS        += vhd-util-scan.c
+LIB-SRCS        += vhd-util-check.c
+
+# FIXME hack, make it a shared lib
+LIB-SRCS        += $(BLKTAP_ROOT)/lvm/lvm-util.c
+
+# TODO Not in blktap2.5.
+#LIB-SRCS += vhd-util-uuid.c
 
 LIB-OBJS         = $(patsubst %.c,%.o,$(LIB-SRCS))
 
diff --git a/tools/blktap3/vhd/lib/atomicio.c b/tools/blktap3/vhd/lib/atomicio.c
--- a/tools/blktap3/vhd/lib/atomicio.c
+++ b/tools/blktap3/vhd/lib/atomicio.c
@@ -40,7 +40,7 @@ atomicio(f, fd, _s, n)
 {
        char *s = _s;
        size_t pos = 0;
-       size_t res;
+       ssize_t res;
 
        while (n > pos) {
                res = (f) (fd, s + pos, n - pos);
diff --git a/tools/blktap3/vhd/lib/libvhd-journal.c b/tools/blktap3/vhd/lib/libvhd-journal.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/libvhd-journal.c
@@ -0,0 +1,1540 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "atomicio.h"
+#include "libvhd-journal.h"
+
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_P  1
+#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_C  2
+#define VHD_JOURNAL_ENTRY_TYPE_HEADER    3
+#define VHD_JOURNAL_ENTRY_TYPE_LOCATOR   4
+#define VHD_JOURNAL_ENTRY_TYPE_BAT       5
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_H  6
+#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_M  7
+#define VHD_JOURNAL_ENTRY_TYPE_DATA      8
+
+/*
+ * Descriptor written to the journal by vhd_journal_update(); it is
+ * immediately followed by 'size' bytes of payload.
+ */
+typedef struct vhd_journal_entry {
+       /* must equal VHD_JOURNAL_ENTRY_COOKIE to pass validation */
+       uint64_t                         cookie;
+       /* record type supplied by the writer (VHD_JOURNAL_ENTRY_TYPE_*) */
+       uint32_t                         type;
+       /* payload length in bytes; non-zero multiple of VHD_SECTOR_SIZE */
+       uint32_t                         size;
+       /* offset value supplied by the caller of vhd_journal_update() */
+       uint64_t                         offset;
+       /* ~(byte sum of this struct with checksum zeroed, plus payload) */
+       uint32_t                         checksum;
+} vhd_journal_entry_t;
+
+/*
+ * Move the journal file offset.
+ * Returns 0 on success, -errno if lseek64() fails.
+ */
+static inline int
+vhd_journal_seek(vhd_journal_t *j, off64_t offset, int whence)
+{
+       if (lseek64(j->jfd, offset, whence) == (off64_t)-1)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Report the current journal file offset, or (off64_t)-1 (with errno set
+ * by lseek64()) on failure.
+ */
+static inline off64_t
+vhd_journal_position(vhd_journal_t *j)
+{
+       off64_t cur;
+
+       cur = lseek64(j->jfd, 0, SEEK_CUR);
+
+       return cur;
+}
+
+/*
+ * Read exactly 'size' bytes from the journal at its current offset.
+ * Returns 0 on success; on a short read returns -errno if errno was set,
+ * otherwise -EIO.
+ */
+static inline int
+vhd_journal_read(vhd_journal_t *j, void *buf, size_t size)
+{
+       ssize_t done;
+
+       /* cleared so a short read without an error can be told apart */
+       errno = 0;
+       done  = atomicio(read, j->jfd, buf, size);
+       if (done == size)
+               return 0;
+
+       if (errno)
+               return -errno;
+
+       return -EIO;
+}
+
+/*
+ * Write exactly 'size' bytes to the journal at its current offset.
+ * Returns 0 on success; on a short write returns -errno if errno was set,
+ * otherwise -EIO.
+ */
+static inline int
+vhd_journal_write(vhd_journal_t *j, void *buf, size_t size)
+{
+       ssize_t done;
+
+       /* cleared so a short write without an error can be told apart */
+       errno = 0;
+       done  = atomicio(vwrite, j->jfd, buf, size);
+       if (done == size)
+               return 0;
+
+       if (errno)
+               return -errno;
+
+       return -EIO;
+}
+
+/*
+ * Set the journal file length to 'length' bytes.
+ * Returns 0 on success, -errno if ftruncate() fails.
+ */
+static inline int
+vhd_journal_truncate(vhd_journal_t *j, off64_t length)
+{
+       if (ftruncate(j->jfd, length) == -1)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Flush journal data with fdatasync().
+ * Returns 0 on success, -errno on failure.
+ */
+static inline int
+vhd_journal_sync(vhd_journal_t *j)
+{
+       if (fdatasync(j->jfd))
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Apply the BE64_IN/BE32_IN macros to the footer offset, entry counts and
+ * data/metadata offsets of a journal header that was just read from the
+ * journal file (presumably big-endian to host order; the macros are
+ * defined elsewhere -- TODO confirm).
+ * NOTE(review): journal_eof is not converted here, nor in
+ * vhd_journal_header_out(); confirm that this is intentional.
+ */
+static inline void
+vhd_journal_header_in(vhd_journal_header_t *header)
+{
+       BE64_IN(&header->vhd_footer_offset);
+       BE32_IN(&header->journal_data_entries);
+       BE32_IN(&header->journal_metadata_entries);
+       BE64_IN(&header->journal_data_offset);
+       BE64_IN(&header->journal_metadata_offset);
+}
+
+/*
+ * Apply the BE64_OUT/BE32_OUT macros to the footer offset, entry counts
+ * and data/metadata offsets of a journal header before it is written to
+ * the journal file (presumably host to big-endian order; the macros are
+ * defined elsewhere -- TODO confirm).
+ * NOTE(review): journal_eof is not converted here, nor in
+ * vhd_journal_header_in(); confirm that this is intentional.
+ */
+static inline void
+vhd_journal_header_out(vhd_journal_header_t *header)
+{
+       BE64_OUT(&header->vhd_footer_offset);
+       BE32_OUT(&header->journal_data_entries);
+       BE32_OUT(&header->journal_metadata_entries);
+       BE64_OUT(&header->journal_data_offset);
+       BE64_OUT(&header->journal_metadata_offset);
+}
+
+/*
+ * Sanity-check a journal header held in host byte order.
+ *
+ * Verifies the header cookie, checks that the journal file can be
+ * positioned at header->journal_eof, and checks that the data and
+ * metadata offsets do not lie beyond journal_eof.
+ *
+ * All checks are made against 'header' itself; previously the offsets of
+ * j->header were examined regardless of which header was passed in.
+ *
+ * Side effect: on success the journal file offset is left at
+ * header->journal_eof.
+ *
+ * Returns 0 if the header looks sane, -EINVAL if not, or -errno if
+ * seeking fails.
+ */
+static int
+vhd_journal_validate_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+       int err;
+       off64_t eof;
+
+       if (memcmp(header->cookie,
+                  VHD_JOURNAL_HEADER_COOKIE, sizeof(header->cookie)))
+               return -EINVAL;
+
+       err = vhd_journal_seek(j, header->journal_eof, SEEK_SET);
+       if (err)
+               return err;
+
+       eof = vhd_journal_position(j);
+       if (eof == (off64_t)-1)
+               return -errno;
+
+       if (header->journal_data_offset > header->journal_eof)
+               return -EINVAL;
+
+       if (header->journal_metadata_offset > header->journal_eof)
+               return -EINVAL;
+
+       return 0;
+}
+
+/*
+ * Load the journal header from offset 0 of the journal file into
+ * 'header', convert it with vhd_journal_header_in() and validate it.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_read_journal_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+       int err;
+
+       err = vhd_journal_seek(j, 0, SEEK_SET);
+       if (!err)
+               err = vhd_journal_read(j, header,
+                                      sizeof(vhd_journal_header_t));
+       if (err)
+               return err;
+
+       vhd_journal_header_in(header);
+
+       return vhd_journal_validate_header(j, header);
+}
+
+/*
+ * Validate 'header' and write a converted copy of it at offset 0 of the
+ * journal file. The caller's header is left untouched.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_write_header(vhd_journal_t *j, vhd_journal_header_t *header)
+{
+       vhd_journal_header_t disk;
+       int err;
+
+       /* work on a copy so the in-memory header keeps its byte order */
+       memcpy(&disk, header, sizeof(disk));
+
+       err = vhd_journal_validate_header(j, &disk);
+       if (err)
+               return err;
+
+       vhd_journal_header_out(&disk);
+
+       err = vhd_journal_seek(j, 0, SEEK_SET);
+       if (err)
+               return err;
+
+       return vhd_journal_write(j, &disk, sizeof(disk));
+}
+
+/*
+ * Initialise j->header for a fresh journal and write it out.
+ *
+ * The header is zeroed, then given the VHD's uuid, the journal cookie,
+ * the offset of the VHD's trailing footer (end of file minus the footer
+ * size) and a journal_eof pointing just past the journal header.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_add_journal_header(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       off64_t end;
+       int err;
+
+       memset(&j->header, 0, sizeof(vhd_journal_header_t));
+
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err)
+               return err;
+
+       end = vhd_position(vhd);
+       if (end == (off64_t)-1)
+               return -errno;
+
+       err = vhd_get_footer(vhd);
+       if (err)
+               return err;
+
+       uuid_copy(j->header.uuid, vhd->footer.uuid);
+       memcpy(j->header.cookie,
+              VHD_JOURNAL_HEADER_COOKIE, sizeof(j->header.cookie));
+       j->header.journal_eof       = sizeof(vhd_journal_header_t);
+       j->header.vhd_footer_offset = end - sizeof(vhd_footer_t);
+
+       return vhd_journal_write_header(j, &j->header);
+}
+
+/*
+ * Apply the BE32_IN/BE64_IN macros to every field of a journal entry that
+ * was just read from the journal file (presumably big-endian to host
+ * order; the macros are defined elsewhere -- TODO confirm).
+ */
+static void
+vhd_journal_entry_in(vhd_journal_entry_t *entry)
+{
+       BE32_IN(&entry->type);
+       BE32_IN(&entry->size);
+       BE64_IN(&entry->offset);
+       BE64_IN(&entry->cookie);
+       BE32_IN(&entry->checksum);
+}
+
+/*
+ * Apply the BE32_OUT/BE64_OUT macros to every field of a journal entry
+ * before it is written to the journal file (presumably host to
+ * big-endian order; the macros are defined elsewhere -- TODO confirm).
+ */
+static void
+vhd_journal_entry_out(vhd_journal_entry_t *entry)
+{
+       BE32_OUT(&entry->type);
+       BE32_OUT(&entry->size);
+       BE64_OUT(&entry->offset);
+       BE64_OUT(&entry->cookie);
+       BE32_OUT(&entry->checksum);
+}
+
+/*
+ * Compute the checksum of a journal entry and its payload.
+ *
+ * The result is the one's complement of the byte-wise sum of the entry
+ * descriptor (with its checksum field treated as zero) plus the 'size'
+ * payload bytes at 'buf'. entry->checksum is restored before returning.
+ *
+ * The loop indices are size_t so that they match the unsigned bounds
+ * they are compared against.
+ */
+static uint32_t
+vhd_journal_checksum_entry(vhd_journal_entry_t *entry, char *buf, size_t size)
+{
+       size_t i;
+       const unsigned char *blob;
+       uint32_t checksum, saved;
+
+       checksum        = 0;
+       saved           = entry->checksum;
+       entry->checksum = 0;
+
+       blob = (const unsigned char *)entry;
+       for (i = 0; i < sizeof(vhd_journal_entry_t); i++)
+               checksum += blob[i];
+
+       blob = (const unsigned char *)buf;
+       for (i = 0; i < size; i++)
+               checksum += blob[i];
+
+       entry->checksum = saved;
+       return ~checksum;
+}
+
+/*
+ * Check a journal entry descriptor held in host byte order: its size must
+ * be a non-zero multiple of VHD_SECTOR_SIZE and its cookie must match
+ * VHD_JOURNAL_ENTRY_COOKIE. Returns 0 if valid, -EINVAL otherwise.
+ */
+static int
+vhd_journal_validate_entry(vhd_journal_entry_t *entry)
+{
+       if (entry->size == 0 || (entry->size & (VHD_SECTOR_SIZE - 1)))
+               return -EINVAL;
+
+       return (entry->cookie != VHD_JOURNAL_ENTRY_COOKIE) ? -EINVAL : 0;
+}
+
+/*
+ * Read one entry descriptor at the current journal offset, convert it
+ * with vhd_journal_entry_in() and validate it.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_read_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+       int err = vhd_journal_read(j, entry, sizeof(vhd_journal_entry_t));
+
+       if (err)
+               return err;
+
+       vhd_journal_entry_in(entry);
+
+       return vhd_journal_validate_entry(entry);
+}
+
+/*
+ * Validate 'entry' and write a converted copy of it at the current
+ * journal offset. The caller's entry is left untouched.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_write_entry(vhd_journal_t *j, vhd_journal_entry_t *entry)
+{
+       vhd_journal_entry_t disk;
+       int err;
+
+       err = vhd_journal_validate_entry(entry);
+       if (err)
+               return err;
+
+       memcpy(&disk, entry, sizeof(disk));
+       vhd_journal_entry_out(&disk);
+
+       return vhd_journal_write(j, &disk, sizeof(disk));
+}
+
+/*
+ * Verify that the checksum recomputed over 'entry' and its payload 'buf'
+ * (entry->size bytes) matches entry->checksum.
+ * Returns 0 on a match, -EINVAL otherwise.
+ */
+static int
+vhd_journal_validate_entry_data(vhd_journal_entry_t *entry, char *buf)
+{
+       uint32_t expected;
+
+       expected = vhd_journal_checksum_entry(entry, buf, entry->size);
+
+       return (expected == entry->checksum) ? 0 : -EINVAL;
+}
+
+/*
+ * Append one record to the journal and account for it in the header.
+ *
+ * A descriptor for ('offset', 'size', 'type') followed by the 'size'
+ * payload bytes at 'buf' is written at j->header.journal_eof. The matching
+ * entry counter (data or metadata) is then incremented, the section's
+ * start offset is recorded if this is its first entry, journal_eof is
+ * advanced, and the header is rewritten.
+ *
+ * On failure the in-memory header is rolled back and, unless the journal
+ * is a block device, the journal file is truncated back to journal_eof
+ * (best effort; the truncate result is ignored).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_update(vhd_journal_t *j, off64_t offset,
+                  char *buf, size_t size, uint32_t type)
+{
+       int err;
+       uint64_t *off, off_bak;
+       uint32_t *entries;
+       vhd_journal_entry_t entry;
+
+       /*
+        * Zero the whole descriptor first: the checksum below sums every
+        * byte of the struct, padding included, and those same bytes are
+        * written to disk. Without this the padding would be indeterminate
+        * stack contents.
+        */
+       memset(&entry, 0, sizeof(entry));
+
+       entry.type     = type;
+       entry.size     = size;
+       entry.offset   = offset;
+       entry.cookie   = VHD_JOURNAL_ENTRY_COOKIE;
+       entry.checksum = vhd_journal_checksum_entry(&entry, buf, size);
+
+       err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET);
+       if (err)
+               return err;
+
+       err = vhd_journal_write_entry(j, &entry);
+       if (err)
+               goto fail;
+
+       err = vhd_journal_write(j, buf, size);
+       if (err)
+               goto fail;
+
+       if (type == VHD_JOURNAL_ENTRY_TYPE_DATA) {
+               off     = &j->header.journal_data_offset;
+               entries = &j->header.journal_data_entries;
+       } else {
+               off     = &j->header.journal_metadata_offset;
+               entries = &j->header.journal_metadata_entries;
+       }
+
+       /* the first entry of a section fixes that section's start offset */
+       off_bak = *off;
+       if (!(*entries)++)
+               *off = j->header.journal_eof;
+       j->header.journal_eof += (size + sizeof(vhd_journal_entry_t));
+
+       err = vhd_journal_write_header(j, &j->header);
+       if (err) {
+               /* undo the in-memory accounting */
+               if (!--(*entries))
+                       *off = off_bak;
+               j->header.journal_eof -= (size + sizeof(vhd_journal_entry_t));
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       if (!j->is_block)
+               vhd_journal_truncate(j, j->header.journal_eof);
+       return err;
+}
+
+/*
+ * Journal the VHD footer found at the end of the file and, for dynamic
+ * VHDs, also the footer copy stored at offset 0.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_add_footer(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_footer_t footer;
+       off64_t end, primary;
+       int err;
+
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err)
+               return err;
+
+       end = vhd_position(vhd);
+       if (end == (off64_t)-1)
+               return -errno;
+
+       /* the primary footer occupies the last bytes of the file */
+       primary = end - sizeof(vhd_footer_t);
+
+       err = vhd_read_footer_at(vhd, &footer, primary);
+       if (err)
+               return err;
+
+       vhd_footer_out(&footer);
+       err = vhd_journal_update(j, primary, (char *)&footer,
+                                sizeof(vhd_footer_t),
+                                VHD_JOURNAL_ENTRY_TYPE_FOOTER_P);
+       if (err || !vhd_type_dynamic(vhd))
+               return err;
+
+       err = vhd_read_footer_at(vhd, &footer, 0);
+       if (err)
+               return err;
+
+       vhd_footer_out(&footer);
+
+       return vhd_journal_update(j, 0, (char *)&footer,
+                                 sizeof(vhd_footer_t),
+                                 VHD_JOURNAL_ENTRY_TYPE_FOOTER_C);
+}
+
+/*
+ * Journal the VHD header, recording it at the offset given by the
+ * footer's data_offset field.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_add_header(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_header_t header;
+       off64_t where;
+       int err;
+
+       err = vhd_read_header(vhd, &header);
+       if (err)
+               return err;
+
+       where = vhd->footer.data_offset;
+       vhd_header_out(&header);
+
+       return vhd_journal_update(j, where, (char *)&header,
+                                 sizeof(vhd_header_t),
+                                 VHD_JOURNAL_ENTRY_TYPE_HEADER);
+}
+
+/*
+ * Journal the data of every parent locator in the VHD header whose
+ * platform code is not PLAT_CODE_NONE.
+ * Stops at the first failure. Returns 0 on success or a negative error
+ * code.
+ */
+static int
+vhd_journal_add_locators(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       int idx, count, err;
+
+       err = vhd_get_header(vhd);
+       if (err)
+               return err;
+
+       count = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+       for (idx = 0; idx < count; idx++) {
+               vhd_parent_locator_t *loc = &vhd->header.loc[idx];
+               off64_t where;
+               size_t len;
+               void *data;
+
+               err = vhd_validate_platform_code(loc->code);
+               if (err)
+                       return err;
+
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               where = loc->data_offset;
+               len   = vhd_parent_locator_size(loc);
+
+               /* posix_memalign() returns a positive error number */
+               err = posix_memalign(&data, VHD_SECTOR_SIZE, len);
+               if (err)
+                       return -err;
+
+               err = vhd_seek(vhd, where, SEEK_SET);
+               if (!err)
+                       err = vhd_read(vhd, data, len);
+               if (!err)
+                       err = vhd_journal_update(j, where, data, len,
+                                                VHD_JOURNAL_ENTRY_TYPE_LOCATOR);
+
+               free(data);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Journal the VHD's BAT, recorded at the header's table_offset.
+ * The BAT buffer obtained from vhd_read_bat() is freed before returning.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_add_bat(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_bat_t bat;
+       off64_t where;
+       size_t len;
+       int err;
+
+       err = vhd_get_header(vhd);
+       if (err)
+               return err;
+
+       err = vhd_read_bat(vhd, &bat);
+       if (err)
+               return err;
+
+       where = vhd->header.table_offset;
+       len   = vhd_bytes_padded(bat.entries * sizeof(uint32_t));
+
+       vhd_bat_out(&bat);
+       err = vhd_journal_update(j, where, (char *)bat.bat, len,
+                                VHD_JOURNAL_ENTRY_TYPE_BAT);
+       free(bat.bat);
+
+       return err;
+}
+
+/*
+ * Journal the batmap as two records: first its header (padded with
+ * vhd_bytes_padded()), then the map itself.
+ * The map buffer obtained from vhd_read_batmap() is freed before
+ * returning. Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_add_batmap(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_batmap_t batmap;
+       off64_t where;
+       size_t len;
+       int err;
+
+       err = vhd_batmap_header_offset(vhd, &where);
+       if (err)
+               return err;
+
+       err = vhd_read_batmap(vhd, &batmap);
+       if (err)
+               return err;
+
+       len = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+       vhd_batmap_header_out(&batmap);
+       err = vhd_journal_update(j, where, (char *)&batmap.header, len,
+                                VHD_JOURNAL_ENTRY_TYPE_BATMAP_H);
+       if (!err) {
+               /* convert back: the fields below are read from the header */
+               vhd_batmap_header_in(&batmap);
+               where = batmap.header.batmap_offset;
+               len   = vhd_sectors_to_bytes(batmap.header.batmap_size);
+
+               err = vhd_journal_update(j, where, batmap.map, len,
+                                        VHD_JOURNAL_ENTRY_TYPE_BATMAP_M);
+       }
+
+       free(batmap.map);
+       return err;
+}
+
+/*
+ * Journal all VHD metadata: the footer(s) and, for dynamic VHDs, the
+ * header, parent locators, BAT and (if present) batmap. Afterwards
+ * journal_data_offset is set to the current journal_eof and the journal
+ * header is rewritten. For non-dynamic VHDs the function returns right
+ * after the footer has been journalled.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_add_metadata(vhd_journal_t *j)
+{
+       vhd_context_t *vhd = &j->vhd;
+       int err;
+
+       err = vhd_journal_add_footer(j);
+       if (err || !vhd_type_dynamic(vhd))
+               return err;
+
+       err = vhd_journal_add_header(j);
+       if (!err)
+               err = vhd_journal_add_locators(j);
+       if (!err)
+               err = vhd_journal_add_bat(j);
+       if (!err && vhd_has_batmap(vhd))
+               err = vhd_journal_add_batmap(j);
+       if (err)
+               return err;
+
+       /* data entries start where the metadata ends */
+       j->header.journal_data_offset = j->header.journal_eof;
+
+       return vhd_journal_write_header(j, &j->header);
+}
+
+/*
+ * Read the next journal record, which must be of the given footer 'type'
+ * and exactly sizeof(vhd_footer_t) bytes long, into 'footer'; then
+ * convert and validate it.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+__vhd_journal_read_footer(vhd_journal_t *j,
+                         vhd_footer_t *footer, uint32_t type)
+{
+       vhd_journal_entry_t entry;
+       int err;
+
+       err = vhd_journal_read_entry(j, &entry);
+       if (err)
+               return err;
+
+       if (entry.type != type || entry.size != sizeof(vhd_footer_t))
+               return -EINVAL;
+
+       err = vhd_journal_read(j, footer, entry.size);
+       if (err)
+               return err;
+
+       vhd_footer_in(footer);
+
+       return vhd_validate_footer(footer);
+}
+
+/* Read the journalled primary footer (FOOTER_P record) into 'footer'. */
+static int
+vhd_journal_read_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       const uint32_t type = VHD_JOURNAL_ENTRY_TYPE_FOOTER_P;
+
+       return __vhd_journal_read_footer(j, footer, type);
+}
+
+/* Read the journalled footer copy (FOOTER_C record) into 'footer'. */
+static int
+vhd_journal_read_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       const uint32_t type = VHD_JOURNAL_ENTRY_TYPE_FOOTER_C;
+
+       return __vhd_journal_read_footer(j, footer, type);
+}
+
+/*
+ * Read the next journal record, which must be a HEADER record of exactly
+ * sizeof(vhd_header_t) bytes, into 'header'; then convert and validate
+ * it. Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_read_header(vhd_journal_t *j, vhd_header_t *header)
+{
+       vhd_journal_entry_t entry;
+       int err;
+
+       err = vhd_journal_read_entry(j, &entry);
+       if (err)
+               return err;
+
+       if (entry.type != VHD_JOURNAL_ENTRY_TYPE_HEADER ||
+           entry.size != sizeof(vhd_header_t))
+               return -EINVAL;
+
+       err = vhd_journal_read(j, header, entry.size);
+       if (err)
+               return err;
+
+       vhd_header_in(header);
+
+       return vhd_validate_header(header);
+}
+
+/*
+ * Read consecutive LOCATOR records from the current journal offset.
+ *
+ * Reading stops at the first record of another type; the journal offset
+ * is then rewound to the start of that record. On success *locators
+ * points to a calloc()ed array of *locs sector-aligned buffers; the
+ * caller must free each buffer and the array. On failure nothing is
+ * handed back (*locators stays NULL, *locs stays 0) and all buffers
+ * allocated here are released, including the one whose payload read
+ * failed.
+ *
+ * Returns 0 on success or a negative error code (-EINVAL if the journal
+ * holds more locator records than the VHD header has locator slots).
+ */
+static int
+vhd_journal_read_locators(vhd_journal_t *j, char ***locators, int *locs)
+{
+       int err, n, _locs;
+       char **_locators;
+       void *buf;
+       off64_t pos;
+       vhd_journal_entry_t entry;
+
+       buf       = NULL;
+       _locs     = 0;
+       *locs     = 0;
+       *locators = NULL;
+
+       n = sizeof(j->vhd.header.loc) / sizeof(vhd_parent_locator_t);
+       _locators = calloc(n, sizeof(char *));
+       if (!_locators)
+               return -ENOMEM;
+
+       for (;;) {
+               /* remember where this record starts so we can rewind */
+               pos = vhd_journal_position(j);
+               if (pos == (off64_t)-1) {
+                       err = -errno;
+                       goto fail;
+               }
+
+               err = vhd_journal_read_entry(j, &entry);
+               if (err)
+                       goto fail;
+
+               if (entry.type != VHD_JOURNAL_ENTRY_TYPE_LOCATOR) {
+                       err = vhd_journal_seek(j, pos, SEEK_SET);
+                       if (err)
+                               goto fail;
+                       break;
+               }
+
+               if (_locs >= n) {
+                       err = -EINVAL;
+                       goto fail;
+               }
+
+               /* posix_memalign() returns a positive error number */
+               err = posix_memalign(&buf, VHD_SECTOR_SIZE, entry.size);
+               if (err) {
+                       err = -err;
+                       buf = NULL;
+                       goto fail;
+               }
+
+               err = vhd_journal_read(j, buf, entry.size);
+               if (err)
+                       goto fail;
+
+               /* ownership moves to the array */
+               _locators[_locs++] = buf;
+               buf                = NULL;
+       }
+
+       *locs     = _locs;
+       *locators = _locators;
+
+       return 0;
+
+fail:
+       /* buf is non-NULL only if its payload read failed */
+       free(buf);
+       for (n = 0; n < _locs; n++)
+               free(_locators[n]);
+       free(_locators);
+       return err;
+}
+
+/*
+ * Read the journalled BAT into 'bat'.
+ *
+ * The next record must be a BAT record whose size matches the padded
+ * size derived from header.max_bat_size and whose offset equals
+ * header.table_offset. On success bat->bat is a newly allocated,
+ * sector-aligned buffer and bat->spb / bat->entries are filled in from
+ * the VHD header. If the payload read fails the buffer is freed and
+ * bat->bat is set to NULL.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_read_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_journal_entry_t entry;
+       size_t expected;
+       void *table;
+       int err;
+
+       expected = vhd_bytes_padded(vhd->header.max_bat_size *
+                                   sizeof(uint32_t));
+
+       err = vhd_journal_read_entry(j, &entry);
+       if (err)
+               return err;
+
+       if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BAT ||
+           entry.size != expected ||
+           entry.offset != vhd->header.table_offset)
+               return -EINVAL;
+
+       /* posix_memalign() returns a positive error number */
+       err = posix_memalign(&table, VHD_SECTOR_SIZE, expected);
+       if (err)
+               return -err;
+       bat->bat = table;
+
+       err = vhd_journal_read(j, bat->bat, entry.size);
+       if (err) {
+               free(bat->bat);
+               bat->bat = NULL;
+               return err;
+       }
+
+       bat->spb     = vhd->header.block_size >> VHD_SECTOR_SHIFT;
+       bat->entries = vhd->header.max_bat_size;
+       vhd_bat_in(bat);
+
+       return 0;
+}
+
+/*
+ * Read the journalled batmap header into batmap->header.
+ *
+ * The next record must be a BATMAP_H record whose size equals the padded
+ * size of struct dd_batmap_hdr. The payload is read into a temporary
+ * sector-aligned buffer, copied into batmap->header, converted with
+ * vhd_batmap_header_in() and validated.
+ *
+ * The temporary buffer is released on every path (it used to leak on
+ * success), and an allocation failure is reported as a negative error
+ * code like every other failure (posix_memalign() returns a positive
+ * error number).
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_read_batmap_header(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       int err;
+       void *buf;
+       size_t size;
+       vhd_journal_entry_t entry;
+
+       size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+       err  = vhd_journal_read_entry(j, &entry);
+       if (err)
+               return err;
+
+       if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_H)
+               return -EINVAL;
+
+       if (entry.size != size)
+               return -EINVAL;
+
+       err = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
+       if (err)
+               return -err;
+
+       err = vhd_journal_read(j, buf, entry.size);
+       if (!err)
+               memcpy(&batmap->header, buf, sizeof(batmap->header));
+
+       /* the header now lives in batmap; the bounce buffer is done */
+       free(buf);
+       if (err)
+               return err;
+
+       vhd_batmap_header_in(batmap);
+       return vhd_validate_batmap_header(batmap);
+}
+
+/*
+ * Read the journalled batmap bitmap into a newly allocated,
+ * sector-aligned buffer stored in batmap->map.
+ *
+ * The next record must be a BATMAP_M record whose size and offset agree
+ * with batmap->header (batmap_size in sectors, batmap_offset). If the
+ * payload read fails the buffer is freed and batmap->map is set to NULL.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_read_batmap_map(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       vhd_journal_entry_t entry;
+       void *bits;
+       int err;
+
+       err = vhd_journal_read_entry(j, &entry);
+       if (err)
+               return err;
+
+       if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_M ||
+           entry.size != vhd_sectors_to_bytes(batmap->header.batmap_size) ||
+           entry.offset != batmap->header.batmap_offset)
+               return -EINVAL;
+
+       /* posix_memalign() returns a positive error number */
+       err = posix_memalign(&bits, VHD_SECTOR_SIZE, entry.size);
+       if (err)
+               return -err;
+
+       batmap->map = bits;
+
+       err = vhd_journal_read(j, batmap->map, entry.size);
+       if (!err)
+               return 0;
+
+       free(batmap->map);
+       batmap->map = NULL;
+       return err;
+}
+
+/*
+ * Read the journalled batmap (header record, then map record) into
+ * 'batmap' and validate it against the VHD. If validation fails the map
+ * buffer is freed and batmap->map is set to NULL.
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_read_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       int err;
+
+       err = vhd_journal_read_batmap_header(j, batmap);
+       if (!err)
+               err = vhd_journal_read_batmap_map(j, batmap);
+       if (err)
+               return err;
+
+       err = vhd_validate_batmap(&j->vhd, batmap);
+       if (!err)
+               return 0;
+
+       free(batmap->map);
+       batmap->map = NULL;
+       return err;
+}
+
+/* Write 'footer' back to the VHD at the offset saved in the journal header. */
+static int
+vhd_journal_restore_footer(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       vhd_context_t *vhd = &j->vhd;
+
+       return vhd_write_footer_at(vhd, footer, j->header.vhd_footer_offset);
+}
+
+/* Write 'footer' back to the VHD as the footer copy at offset 0. */
+static int
+vhd_journal_restore_footer_copy(vhd_journal_t *j, vhd_footer_t *footer)
+{
+       vhd_context_t *vhd = &j->vhd;
+
+       return vhd_write_footer_at(vhd, footer, 0);
+}
+
+/*
+ * Write 'header' back to the VHD at the offset given by the footer's
+ * data_offset field.
+ */
+static int
+vhd_journal_restore_header(vhd_journal_t *j, vhd_header_t *header)
+{
+       off64_t where = j->vhd.footer.data_offset;
+
+       return vhd_write_header_at(&j->vhd, header, where);
+}
+
+/*
+ * Write the journalled locator buffers back to the VHD.
+ *
+ * Walks the header's parent-locator slots in order; every slot whose
+ * code is not PLAT_CODE_NONE receives the next buffer from 'locators',
+ * written at the slot's data_offset, until 'locs' buffers are used up.
+ * Returns 0 on success or the first error from vhd_seek()/vhd_write().
+ */
+static int
+vhd_journal_restore_locators(vhd_journal_t *j, char **locators, int locs)
+{
+       vhd_context_t *vhd = &j->vhd;
+       vhd_parent_locator_t *loc;
+       int slot, slots, used, err;
+
+       slots = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+       slot  = 0;
+       used  = 0;
+
+       while (slot < slots && used < locs) {
+               loc = &vhd->header.loc[slot++];
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               err = vhd_seek(vhd, loc->data_offset, SEEK_SET);
+               if (err)
+                       return err;
+
+               err = vhd_write(vhd, locators[used],
+                               vhd_parent_locator_size(loc));
+               used++;
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/* Write the journalled BAT back to the VHD. */
+static int
+vhd_journal_restore_bat(vhd_journal_t *j, vhd_bat_t *bat)
+{
+       vhd_context_t *vhd = &j->vhd;
+
+       return vhd_write_bat(vhd, bat);
+}
+
+/* Write the journalled batmap back to the VHD. */
+static int
+vhd_journal_restore_batmap(vhd_journal_t *j, vhd_batmap_t *batmap)
+{
+       vhd_context_t *vhd = &j->vhd;
+
+       return vhd_write_batmap(vhd, batmap);
+}
+
+/*
+ * Restore the VHD's metadata from the journal.
+ *
+ * Phase 1 reads the journalled metadata, starting right after the
+ * journal header: the footer and, for dynamic VHDs, the footer copy,
+ * header, parent locators, BAT and (if present) batmap. Phase 2 checks
+ * that the journal offset reached equals journal_data_offset and writes
+ * everything back to the VHD. Finally, unless the VHD is a block device,
+ * the VHD file is truncated to end right after the restored footer.
+ *
+ * Fixes relative to the previous version:
+ *  - an invalid parent-locator platform code is now reported with the
+ *    validator's error code (the stale, zero 'err' used to be returned);
+ *  - failures after the "restore" label no longer leak 'locators';
+ *  - an ftruncate() failure is reported as -errno rather than -1.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_journal_restore_metadata(vhd_journal_t *j)
+{
+       off64_t off;
+       char **locators;
+       vhd_footer_t copy;
+       vhd_context_t *vhd;
+       int i, locs, hlocs, err;
+
+       vhd      = &j->vhd;
+       locs     = 0;
+       hlocs    = 0;
+       locators = NULL;
+
+       err = vhd_journal_seek(j, sizeof(vhd_journal_header_t), SEEK_SET);
+       if (err)
+               return err;
+
+       err  = vhd_journal_read_footer(j, &vhd->footer);
+       if (err)
+               return err;
+
+       if (!vhd_type_dynamic(vhd))
+               goto restore;
+
+       err  = vhd_journal_read_footer_copy(j, &copy);
+       if (err)
+               return err;
+
+       err  = vhd_journal_read_header(j, &vhd->header);
+       if (err)
+               return err;
+
+       /* count the locator slots that are in use */
+       for (hlocs = 0, i = 0; i < vhd_parent_locator_count(vhd); i++) {
+               err = vhd_validate_platform_code(vhd->header.loc[i].code);
+               if (err)
+                       return err;
+
+               if (vhd->header.loc[i].code != PLAT_CODE_NONE)
+                       hlocs++;
+       }
+
+       if (hlocs) {
+               err  = vhd_journal_read_locators(j, &locators, &locs);
+               if (err)
+                       return err;
+
+               if (hlocs != locs) {
+                       err = -EINVAL;
+                       goto out;
+               }
+       }
+
+       err  = vhd_journal_read_bat(j, &vhd->bat);
+       if (err)
+               goto out;
+
+       if (vhd_has_batmap(vhd)) {
+               err  = vhd_journal_read_batmap(j, &vhd->batmap);
+               if (err)
+                       goto out;
+       }
+
+restore:
+       off  = vhd_journal_position(j);
+       if (off == (off64_t)-1) {
+               err = -errno;
+               goto out;
+       }
+
+       /* the metadata section must end exactly where the data begins */
+       if (j->header.journal_data_offset != off) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err  = vhd_journal_restore_footer(j, &vhd->footer);
+       if (err)
+               goto out;
+
+       if (!vhd_type_dynamic(vhd))
+               goto out;
+
+       err  = vhd_journal_restore_footer_copy(j, &copy);
+       if (err)
+               goto out;
+
+       err  = vhd_journal_restore_header(j, &vhd->header);
+       if (err)
+               goto out;
+
+       if (locs) {
+               err = vhd_journal_restore_locators(j, locators, locs);
+               if (err)
+                       goto out;
+       }
+
+       err  = vhd_journal_restore_bat(j, &vhd->bat);
+       if (err)
+               goto out;
+
+       if (vhd_has_batmap(vhd)) {
+               err  = vhd_journal_restore_batmap(j, &vhd->batmap);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+
+out:
+       if (locators) {
+               for (i = 0; i < locs; i++)
+                       free(locators[i]);
+               free(locators);
+       }
+
+       if (!err && !vhd->is_block) {
+               if (ftruncate(vhd->fd,
+                             j->header.vhd_footer_offset +
+                             sizeof(vhd_footer_t)) == -1)
+                       err = -errno;
+       }
+
+       return err;
+}
+
+/*
+ * Poison the VHD footer: fetch the footer with vhd_get_footer(), overwrite
+ * its cookie with VHD_POISON_COOKIE, recompute the footer checksum and
+ * write the footer back.
+ *
+ * Returns 0 on success, otherwise the non-zero result of
+ * vhd_get_footer() or vhd_write_footer().
+ */
+static int
+vhd_journal_disable_vhd(vhd_journal_t *j)
+{
+       int rc;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_get_footer(ctx);
+       if (rc)
+               return rc;
+
+       memcpy(&ctx->footer.cookie, VHD_POISON_COOKIE,
+              sizeof(ctx->footer.cookie));
+       ctx->footer.checksum = vhd_checksum_footer(&ctx->footer);
+
+       /* zero means success; any other value is handed to the caller as is */
+       rc = vhd_write_footer(ctx, &ctx->footer);
+
+       return rc;
+}
+
+/*
+ * Counterpart of vhd_journal_disable_vhd(): when vhd_disabled() reports the
+ * image as disabled, store HD_COOKIE in the footer cookie, recompute the
+ * checksum and write the footer out.  An image that is not disabled is left
+ * untouched.
+ *
+ * Returns 0 on success, otherwise the non-zero result of
+ * vhd_get_footer() or vhd_write_footer().
+ */
+static int
+vhd_journal_enable_vhd(vhd_journal_t *j)
+{
+       int rc;
+       vhd_context_t *ctx = &j->vhd;
+
+       rc = vhd_get_footer(ctx);
+       if (rc)
+               return rc;
+
+       if (vhd_disabled(ctx)) {
+               memcpy(&ctx->footer.cookie, HD_COOKIE,
+                      sizeof(ctx->footer.cookie));
+               ctx->footer.checksum = vhd_checksum_footer(&ctx->footer);
+
+               rc = vhd_write_footer(ctx, &ctx->footer);
+       }
+
+       /* still 0 here when nothing had to be rewritten */
+       return rc;
+}
+
+/*
+ * Release the resources held by a journal handle: close the journal
+ * descriptor, close the VHD context with vhd_close() and free the journal
+ * file name.  The journal file itself is left in place.
+ *
+ * j->jfd is -1 while no journal has been opened (vhd_journal_open() and
+ * vhd_journal_create() set it before calling open()) and 0 in a handle that
+ * has been zeroed with memset(); neither value is passed to close().
+ *
+ * Always returns 0.
+ */
+int
+vhd_journal_close(vhd_journal_t *j)
+{
+       if (j->jfd > 0)
+               close(j->jfd);
+
+       vhd_close(&j->vhd);
+       free(j->jname);
+
+       return 0;
+}
+
+/*
+ * Re-enable the VHD, then tear the journal down: close the journal
+ * descriptor, unlink the journal file unless it is flagged is_block, close
+ * the VHD context and free the journal file name.
+ *
+ * If re-enabling the VHD fails, that error is returned and nothing is
+ * released.  Otherwise returns 0.
+ *
+ * j->jfd is -1 while no journal has been opened and 0 in a handle that has
+ * been zeroed with memset(); in both cases there is no journal descriptor
+ * to close and the journal file is not unlinked.
+ */
+int
+vhd_journal_remove(vhd_journal_t *j)
+{
+       int err;
+
+       err = vhd_journal_enable_vhd(j);
+       if (err)
+               return err;
+
+       if (j->jfd > 0) {
+               close(j->jfd);
+               if (!j->is_block)
+                       unlink(j->jname);
+       }
+
+       vhd_close(&j->vhd);
+       free(j->jname);
+
+       return 0;
+}
+
+/*
+ * Open an existing journal @jfile for the VHD @file.
+ *
+ * The handle is zeroed, both files are opened with plain open(), the
+ * journal header is read and vhd_journal_restore_metadata() is run against
+ * the VHD.  The VHD is then reopened through vhd_open(), its BAT (and
+ * batmap, when vhd_has_batmap() says there is one) is loaded, and the VHD
+ * footer is poisoned with vhd_journal_disable_vhd().
+ *
+ * Returns 0 on success.  On failure returns -ENOMEM, -errno from open(), or
+ * the error of the failing helper; every failure after the strdup() goes
+ * through vhd_journal_close().
+ */
+int
+vhd_journal_open(vhd_journal_t *j, const char *file, const char *jfile)
+{
+       int err;
+       vhd_context_t *vhd;
+
+       memset(j, 0, sizeof(vhd_journal_t));
+
+       /* -1 marks "journal not opened yet" */
+       j->jfd = -1;
+       vhd    = &j->vhd;
+
+       j->jname = strdup(jfile);
+       if (j->jname == NULL)
+               return -ENOMEM;
+
+       /* no O_CREAT: the journal must already exist */
+       j->jfd = open(j->jname, O_LARGEFILE | O_RDWR);
+       if (j->jfd == -1) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = vhd_test_file_fixed(j->jname, &j->is_block);
+       if (err)
+               goto fail;
+
+       /* raw descriptor on the VHD, used while the metadata is restored */
+       vhd->fd = open(file, O_LARGEFILE | O_RDWR | O_DIRECT);
+       if (vhd->fd == -1) {
+               err = -errno;
+               goto fail;
+       }
+
+       err = vhd_test_file_fixed(file, &vhd->is_block);
+       if (err)
+               goto fail;
+
+       err = vhd_journal_read_journal_header(j, &j->header);
+       if (err)
+               goto fail;
+
+       err = vhd_journal_restore_metadata(j);
+       if (err)
+               goto fail;
+
+       /*
+        * Drop the raw descriptor and the buffers filled during the restore
+        * before the context is handed to vhd_open().
+        * NOTE(review): vhd->fd, bat.bat and batmap.map still hold the stale
+        * values if vhd_open() fails and control reaches vhd_journal_close();
+        * confirm that vhd_open() resets them.
+        */
+       close(vhd->fd);
+       free(vhd->bat.bat);
+       free(vhd->batmap.map);
+
+       err = vhd_open(vhd, file, VHD_OPEN_RDWR);
+       if (err)
+               goto fail;
+
+       err = vhd_get_bat(vhd);
+       if (err)
+               goto fail;
+
+       if (vhd_has_batmap(vhd)) {
+               err = vhd_get_batmap(vhd);
+               if (err)
+                       goto fail;
+       }
+
+       err = vhd_journal_disable_vhd(j);
+       if (err)
+               goto fail;
+
+       return 0;
+
+fail:
+       vhd_journal_close(j);
+       return err;
+}
+
+/*
+ * Create a new journal @jfile for the VHD @file.
+ *
+ * An existing @jfile is only accepted when vhd_test_file_fixed() flags it
+ * as is_block; otherwise -EEXIST is returned.  After the journal is opened
+ * (or created), the VHD is opened strictly, its BAT/batmap are loaded, the
+ * journal header and the VHD metadata are added to the journal, the VHD
+ * footer is poisoned and the journal is synced.
+ *
+ * Returns 0 on success.  Failures before the VHD is open take the fail1
+ * path, which closes and (unless is_block) unlinks the journal and zeroes
+ * the handle; later failures take fail2, which calls vhd_journal_remove().
+ */
+int
+vhd_journal_create(vhd_journal_t *j, const char *file, const char *jfile)
+{
+       int err;
+
+       memset(j, 0, sizeof(vhd_journal_t));
+       /* -1 marks "journal not opened yet" for the fail1 path */
+       j->jfd = -1;
+
+       j->jname = strdup(jfile);
+       if (j->jname == NULL) {
+               err = -ENOMEM;
+               goto fail1;
+       }
+
+       if (access(j->jname, F_OK) == 0) {
+               err = vhd_test_file_fixed(j->jname, &j->is_block);
+               if (err)
+                       goto fail1;
+
+               /* refuse to overwrite an existing journal that is not is_block */
+               if (!j->is_block) {
+                       err = -EEXIST;
+                       goto fail1;
+               }
+       }
+
+       /* is_block journals are opened as they are; others are created empty */
+       if (j->is_block)
+               j->jfd = open(j->jname, O_LARGEFILE | O_RDWR, 0644);
+       else
+               j->jfd = open(j->jname,
+                             O_CREAT | O_TRUNC | O_LARGEFILE | O_RDWR, 0644);
+       if (j->jfd == -1) {
+               err = -errno;
+               goto fail1;
+       }
+
+       err = vhd_open(&j->vhd, file, VHD_OPEN_RDWR | VHD_OPEN_STRICT);
+       if (err)
+               goto fail1;
+
+       err = vhd_get_bat(&j->vhd);
+       if (err)
+               goto fail2;
+
+       if (vhd_has_batmap(&j->vhd)) {
+               err = vhd_get_batmap(&j->vhd);
+               if (err)
+                       goto fail2;
+       }
+
+       err = vhd_journal_add_journal_header(j);
+       if (err)
+               goto fail2;
+
+       err = vhd_journal_add_metadata(j);
+       if (err)
+               goto fail2;
+
+       /* the footer is poisoned only after the metadata is in the journal */
+       err = vhd_journal_disable_vhd(j);
+       if (err)
+               goto fail2;
+
+       err = vhd_journal_sync(j);
+       if (err)
+               goto fail2;
+
+       return 0;
+
+fail1:
+       /* the VHD is not open here: undo only the journal side */
+       if (j->jfd != -1) {
+               close(j->jfd);
+               if (!j->is_block)
+                       unlink(j->jname);
+       }
+       free(j->jname);
+       memset(j, 0, sizeof(vhd_journal_t));
+
+       return err;
+
+fail2:
+       /* journal and VHD are both open: full teardown */
+       vhd_journal_remove(j);
+       return err;
+}
+
+/*
+ * Pass one buffer to vhd_journal_update() as a
+ * VHD_JOURNAL_ENTRY_TYPE_DATA entry and free the buffer, whatever the
+ * outcome.  Returns the result of vhd_journal_update().
+ */
+static int
+vhd_journal_update_and_free(vhd_journal_t *j, off64_t pos,
+                            char *data, size_t len)
+{
+       int rc;
+
+       rc = vhd_journal_update(j, pos, data, len,
+                               VHD_JOURNAL_ENTRY_TYPE_DATA);
+       free(data);
+
+       return rc;
+}
+
+/*
+ * Journal the current on-disk contents of BAT block @block.
+ *
+ * @mode is a bit mask: VHD_JOURNAL_METADATA journals the block's bitmap,
+ * VHD_JOURNAL_DATA journals the block's data sectors, which sit right
+ * after the bitmap.  Both are recorded as VHD_JOURNAL_ENTRY_TYPE_DATA
+ * entries.
+ *
+ * Returns -EINVAL for a non-dynamic VHD, -ERANGE when @block is outside the
+ * BAT, 0 without touching the journal when the block is unallocated
+ * (DD_BLK_UNUSED), the error of a failing helper, or the result of
+ * vhd_journal_sync().
+ */
+int
+vhd_journal_add_block(vhd_journal_t *j, uint32_t block, char mode)
+{
+       int rc;
+       char *data = NULL;
+       off64_t pos;
+       size_t len;
+       uint64_t sector;
+       vhd_context_t *ctx = &j->vhd;
+
+       if (!vhd_type_dynamic(ctx))
+               return -EINVAL;
+
+       rc = vhd_get_bat(ctx);
+       if (rc)
+               return rc;
+
+       if (block >= ctx->bat.entries)
+               return -ERANGE;
+
+       sector = ctx->bat.bat[block];
+       if (sector == DD_BLK_UNUSED)
+               return 0;
+
+       /* byte offset of the block's bitmap */
+       pos = vhd_sectors_to_bytes(sector);
+
+       if (mode & VHD_JOURNAL_METADATA) {
+               len = vhd_sectors_to_bytes(ctx->bm_secs);
+
+               rc = vhd_read_bitmap(ctx, block, &data);
+               if (rc)
+                       return rc;
+
+               rc = vhd_journal_update_and_free(j, pos, data, len);
+               if (rc)
+                       return rc;
+       }
+
+       if (mode & VHD_JOURNAL_DATA) {
+               /* step over the bitmap to reach the data sectors */
+               pos += vhd_sectors_to_bytes(ctx->bm_secs);
+               len  = vhd_sectors_to_bytes(ctx->spb);
+
+               rc = vhd_read_block(ctx, block, &data);
+               if (rc)
+                       return rc;
+
+               rc = vhd_journal_update_and_free(j, pos, data, len);
+               if (rc)
+                       return rc;
+       }
+
+       return vhd_journal_sync(j);
+}
+
+/*
+ * Commit: the transaction completed successfully, so the undo log can be
+ * dropped.  The entry counts and offsets in the journal header are reset to
+ * zero and the header is written out; a journal that is not flagged
+ * is_block is then truncated to the size of the journal header.
+ *
+ * Returns 0 on success, the error of vhd_journal_write_header(), or -errno
+ * when the truncation reports a failure.
+ */
+int
+vhd_journal_commit(vhd_journal_t *j)
+{
+       int rc;
+
+       j->header.journal_metadata_offset  = 0;
+       j->header.journal_data_offset      = 0;
+       j->header.journal_metadata_entries = 0;
+       j->header.journal_data_entries     = 0;
+
+       rc = vhd_journal_write_header(j, &j->header);
+       if (rc)
+               return rc;
+
+       /* journals flagged is_block are never truncated */
+       if (j->is_block)
+               return 0;
+
+       if (vhd_journal_truncate(j, sizeof(vhd_journal_header_t)))
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * revert indicates the transaction failed
+ * and we should revert any changes via the undo log
+ *
+ * The VHD context is closed and the file reopened with plain open() so
+ * that vhd_journal_restore_metadata() can run; the VHD is then reopened
+ * through vhd_open() and every data entry of the journal is read,
+ * validated and written back at its recorded offset.  Finally a VHD that
+ * is not flagged is_block is truncated to the footer position recorded in
+ * the journal header, and the journal is synced.
+ *
+ * Returns 0 on success, -ENOMEM, -errno from open()/ftruncate(), the
+ * negated posix_memalign() error, or the error of the failing helper.
+ */
+int
+vhd_journal_revert(vhd_journal_t *j)
+{
+       int i, err;
+       char *file;
+       void *buf;
+       vhd_context_t *vhd;
+       vhd_journal_entry_t entry;
+
+       err  = 0;
+       vhd  = &j->vhd;
+       buf  = NULL;
+
+       /* vhd_close() is about to run, so keep a private copy of the path */
+       file = strdup(vhd->file);
+       if (!file)
+               return -ENOMEM;
+
+       vhd_close(&j->vhd);
+       j->vhd.fd = open(file, O_RDWR | O_DIRECT | O_LARGEFILE);
+       if (j->vhd.fd == -1) {
+               /* read errno before free(), which may overwrite it */
+               err = -errno;
+               free(file);
+               return err;
+       }
+
+       err = vhd_test_file_fixed(file, &vhd->is_block);
+       if (err) {
+               free(file);
+               return err;
+       }
+
+       err  = vhd_journal_restore_metadata(j);
+       if (err) {
+               free(file);
+               return err;
+       }
+
+       /* drop the raw descriptor and restore buffers before vhd_open() */
+       close(vhd->fd);
+       free(vhd->bat.bat);
+       free(vhd->batmap.map);
+
+       err = vhd_open(vhd, file, VHD_OPEN_RDWR);
+       free(file);
+       if (err)
+               return err;
+
+       err = vhd_journal_seek(j, j->header.journal_data_offset, SEEK_SET);
+       if (err)
+               return err;
+
+       for (i = 0; i < j->header.journal_data_entries; i++) {
+               err = vhd_journal_read_entry(j, &entry);
+               if (err)
+                       goto end;
+
+               /* aligned to VHD_SECTOR_SIZE for the write to the VHD */
+               err = posix_memalign(&buf, VHD_SECTOR_SIZE, entry.size);
+               if (err) {
+                       /* posix_memalign() returns a positive errno value */
+                       err = -err;
+                       buf = NULL;
+                       goto end;
+               }
+
+               err = vhd_journal_read(j, buf, entry.size);
+               if (err)
+                       goto end;
+
+               err = vhd_journal_validate_entry_data(&entry, buf);
+               if (err)
+                       goto end;
+
+               err = vhd_seek(vhd, entry.offset, SEEK_SET);
+               if (err)
+                       goto end;
+
+               err = vhd_write(vhd, buf, entry.size);
+               if (err)
+                       goto end;
+
+               err = 0;
+
+       end:
+               /* per-entry cleanup; the first failure stops the replay */
+               free(buf);
+               buf = NULL;
+               if (err)
+                       break;
+       }
+
+       if (err)
+               return err;
+
+       if (!vhd->is_block) {
+               err = ftruncate(vhd->fd, j->header.vhd_footer_offset +
+                               sizeof(vhd_footer_t));
+               if (err)
+                       return -errno;
+       }
+
+       return vhd_journal_sync(j);
+}
diff --git a/tools/blktap3/vhd/lib/libvhd.c b/tools/blktap3/vhd/lib/libvhd.c
--- a/tools/blktap3/vhd/lib/libvhd.c
+++ b/tools/blktap3/vhd/lib/libvhd.c
@@ -44,6 +44,10 @@
 #include "libvhd.h"
 #include "relative-path.h"
 
+/* VHD uses an epoch of 12:00AM UTC, Jan 1, 2000. This is the Unix timestamp
+ * (seconds since 12:00AM UTC, Jan 1, 1970) for the start of the VHD epoch. */
+#define VHD_EPOCH_START 946684800
+
 #define VHD_HEADER_MAX_RETRIES 10
 
 static int libvhd_dbg = 0;
@@ -698,19 +702,10 @@ vhd_end_of_data(vhd_context_t *ctx, off6
        return 0;
 }
 
-uint32_t
+inline uint32_t
 vhd_time(time_t time)
 {
-       struct tm tm;
-       time_t micro_epoch;
-
-       memset(&tm, 0, sizeof(struct tm));
-       tm.tm_year   = 100;
-       tm.tm_mon    = 0;
-       tm.tm_mday   = 1;
-       micro_epoch  = mktime(&tm);
-
-       return (uint32_t)(time - micro_epoch);
+       return (uint32_t)(time - VHD_EPOCH_START);
 }
 
 /* 
@@ -721,20 +716,10 @@ size_t
 vhd_time_to_string(uint32_t timestamp, char *target)
 {
        char *cr;
-       struct tm tm;
-       time_t t1, t2;
-
-       memset(&tm, 0, sizeof(struct tm));
-
-       /* VHD uses an epoch of 12:00AM, Jan 1, 2000.         */
-       /* Need to adjust this to the expected epoch of 1970. */
-       tm.tm_year  = 100;
-       tm.tm_mon   = 0;
-       tm.tm_mday  = 1;
-
-       t1 = mktime(&tm);
-       t2 = t1 + (time_t)timestamp;
-       ctime_r(&t2, target);
+       time_t unix_timestamp;
+
+       unix_timestamp = (time_t)timestamp + VHD_EPOCH_START;
+       ctime_r(&unix_timestamp, target);
 
        /* handle mad ctime_r newline appending. */
        if ((cr = strchr(target, '\n')) != NULL)
@@ -2808,6 +2793,11 @@ vhd_change_parent(vhd_context_t *child, 
        vhd_context_t parent;
        char __parent_path[PATH_MAX];
 
+       if (child->footer.type != HD_TYPE_DIFF) {
+               VHDLOG("would-be child is not a differencing disk\n");
+               return -EINVAL;
+       }
+
        ppath = realpath(parent_path, __parent_path);
        if (!ppath) {
                VHDLOG("error resolving parent path %s for %s: %d\n",
@@ -3225,22 +3215,29 @@ static int
 {
        off64_t off;
        uint32_t blk, sec;
-       int err, cnt, map_off;
+       int err, cnt, map_off, i;
        char *bitmap, *data, *src;
 
        map_off = 0;
 
        do {
+               data   = NULL;
+               bitmap = NULL;
+               if (sector >= ctx->footer.curr_size >> VHD_SECTOR_SHIFT) {
+                       cnt = secs;
+                       for (i = 0; i < cnt; i++)
+                               set_bit(map, map_off + i);
+                       /* buf has already been zeroed out */
+                       goto next;
+               }
+
                blk    = sector / ctx->spb;
                sec    = sector % ctx->spb;
-               off    = ctx->bat.bat[blk];
-               data   = NULL;
-               bitmap = NULL;
-
-               if (off == DD_BLK_UNUSED) {
-                       cnt = MIN(secs, ctx->spb);
+               cnt = MIN(secs, ctx->spb - sec);
+               off = ctx->bat.bat[blk];
+
+               if (off == DD_BLK_UNUSED)
                        goto next;
-               }
 
                err = vhd_read_bitmap(ctx, blk, &bitmap);
                if (err)
@@ -3252,7 +3249,6 @@ static int
                        return err;
                }
 
-               cnt = MIN(secs, ctx->spb - sec);
                src = data + vhd_sectors_to_bytes(sec);
 
                __vhd_io_dynamic_copy_data(ctx,
diff --git a/tools/blktap3/vhd/lib/libvhdio.c b/tools/blktap3/vhd/lib/libvhdio.c
--- a/tools/blktap3/vhd/lib/libvhdio.c
+++ b/tools/blktap3/vhd/lib/libvhdio.c
@@ -1381,17 +1381,25 @@ int __lxstat64(int version, const char *
     return ret;
 }
 
-int ioctl(int fd, int request, char *argp)
+#ifdef __x86_64__      /* type and printf format of the ioctl request argument */
+#define IOCTL_REQUEST long long
+#define IOCTL_REQUEST_FMT "%llx"       /* "ll" is the standard long long modifier */
+#else
+#define IOCTL_REQUEST int
+#define IOCTL_REQUEST_FMT "%x"
+#endif
+
+int ioctl(int fd, IOCTL_REQUEST request, char *argp)
 {
     vhd_fd_context_t *vhd_fd;
-    static int (*_std_ioctl) (int, int, char *);
+    static int (*_std_ioctl) (int, IOCTL_REQUEST, char *);
 
     _RESOLVE(_std_ioctl);
     vhd_fd = _libvhd_io_map_get(fd);
     if (!vhd_fd)
         return _std_ioctl(fd, request, argp);
 
-    LOG("%s 0x%x 0x%x %p\n", __func__, fd, request, argp);
+    LOG("%s 0x%x 0x" IOCTL_REQUEST_FMT " %p\n", __func__, fd, request, argp);
 
 #ifdef BLKGETSIZE64
     if (request == BLKGETSIZE64) {
diff --git a/tools/blktap3/vhd/lib/test/random-copy.c 
b/tools/blktap3/vhd/lib/test/random-copy.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/test/random-copy.c
@@ -0,0 +1,226 @@
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+
+/* A byte range of the source file; both offsets are inclusive. */
+struct range {
+       off64_t         start;
+       off64_t         end;
+};
+
+/* State shared by the steps of one copy run. */
+struct random_copy_ctx {
+       int             sfd;            /* source descriptor, read with pread() */
+       int             dfd;            /* destination descriptor, written with pwrite() */
+       int             total_chunks;   /* number of entries used in chunks */
+       struct range   *chunks;         /* heap-allocated array of ranges to copy */
+};
+
+/* Print the command-line synopsis on stdout and exit with status @err. */
+static void
+usage(const char *app, int err)
+{
+       fprintf(stdout, "usage: %s <src> <dst>\n", app);
+       exit(err);
+}
+
+/*
+ * Split the source file into consecutive ranges of random length and store
+ * them in ctx->chunks, with the count in ctx->total_chunks.
+ *
+ * Each range is [start, start + len] (inclusive) with len drawn from
+ * 1 .. max(1, size / 10), clamped to the last byte of the file.  The array
+ * starts with room for 100 ranges and doubles when full.
+ *
+ * Returns 0 on success, or a positive errno value: the fstat64() error, or
+ * ENOMEM when the array cannot be allocated or grown (ctx->chunks is then
+ * NULL).
+ */
+static int
+random_copy_carve_source(struct random_copy_ctx *ctx)
+{
+       int err, i, n;
+       struct stat64 st;
+       off64_t start, max_chunk;
+
+       err = fstat64(ctx->sfd, &st);
+       if (err) {
+               /* save errno first: perror() may overwrite it */
+               err = errno;
+               perror("stat source");
+               return err;
+       }
+
+       n     = 100;
+       start = 0;
+
+       /*
+        * Sources shorter than 10 bytes make size / 10 zero, which must not
+        * be used as a modulus; fall back to one-byte steps.
+        */
+       max_chunk = st.st_size / 10;
+       if (max_chunk < 1)
+               max_chunk = 1;
+
+       ctx->chunks = calloc(n, sizeof(struct range));
+       if (!ctx->chunks) {
+               printf("calloc failed\n");
+               return ENOMEM;
+       }
+
+       for (i = 0; start < st.st_size; i++) {
+               off64_t chunk, end;
+
+               if (i == n) {
+                       struct range *new;
+
+                       n  *= 2;
+                       new = realloc(ctx->chunks, n * sizeof(struct range));
+                       if (!new) {
+                               free(ctx->chunks);
+                               ctx->chunks = NULL;
+                               printf("realloc failed\n");
+                               return ENOMEM;
+                       }
+
+                       ctx->chunks = new;
+               }
+
+               /* off64_t keeps the length intact for very large sources */
+               chunk = (random() % max_chunk) + 1;
+               end = start + chunk;
+               if (end >= st.st_size)
+                       end = st.st_size - 1;
+
+               ctx->chunks[i].start = start;
+               ctx->chunks[i].end   = end;
+
+               /* ranges are inclusive, so the next one starts past end */
+               start  = end + 1;
+       }
+
+       ctx->total_chunks = i;
+
+       return 0;
+}
+
+/*
+ * Shuffle ctx->chunks in place: every slot, in order, is swapped with a
+ * slot picked by random() % total_chunks.  Always returns 0.
+ */
+static int
+random_copy_permute_source(struct random_copy_ctx *ctx)
+{
+       int pos = 0;
+
+       while (pos < ctx->total_chunks) {
+               struct range *here  = &ctx->chunks[pos];
+               struct range *other = &ctx->chunks[random() % ctx->total_chunks];
+               struct range held   = *other;
+
+               /* here and other may be the same slot; the swap is then a no-op */
+               *other = *here;
+               *here  = held;
+
+               pos++;
+       }
+
+       return 0;
+}
+
+/*
+ * Prepare @ctx for a copy from @src to @dst: open both files (the
+ * destination must already exist, no O_CREAT is passed), carve the source
+ * into ranges and shuffle them.
+ *
+ * Returns 0 on success or a positive error value.  On failure both
+ * descriptors are closed and @ctx is zeroed.
+ */
+static int
+random_copy_init(struct random_copy_ctx *ctx, const char *src, const char *dst)
+{
+       int err;
+
+       memset(ctx, 0, sizeof(*ctx));
+       /* -1 marks "not opened" for the close() calls on the fail path */
+       ctx->sfd = ctx->dfd = -1;
+
+       ctx->sfd = open(src, O_LARGEFILE | O_RDONLY);
+       if (ctx->sfd == -1) {
+               /* errno is saved before perror() can change it */
+               err = errno;
+               perror("opening source");
+               goto fail;
+       }
+
+       ctx->dfd = open(dst, O_LARGEFILE | O_WRONLY);
+       if (ctx->dfd == -1) {
+               err = errno;
+               perror("opening destination");
+               goto fail;
+       }
+
+       err = random_copy_carve_source(ctx);
+       if (err) {
+               printf("failed to carve source: %d\n", err);
+               goto fail;
+       }
+
+       err = random_copy_permute_source(ctx);
+       if (err) {
+               printf("failed to permute source: %d\n", err);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       close(ctx->sfd);
+       close(ctx->dfd);
+       memset(ctx, 0, sizeof(*ctx));
+       return err;
+}
+
+/*
+ * Copy every range in ctx->chunks, in array order, from ctx->sfd to the
+ * same offset in ctx->dfd.  A range is read with one pread() and written
+ * with one pwrite(); a short transfer is treated as an error.
+ *
+ * Returns 0 on success, ENOMEM when a buffer cannot be allocated, or the
+ * errno left by the failing call (EIO when errno is 0).
+ */
+static int
+random_copy(struct random_copy_ctx *ctx)
+{
+       char *buf;
+       int i, err;
+       ssize_t ret;
+
+       for (i = 0; i < ctx->total_chunks; i++) {
+               struct range *r = &ctx->chunks[i];
+               size_t count    = r->end - r->start + 1;
+
+               buf = calloc(1, count);
+               if (!buf) {
+                       printf("calloc failed\n");
+                       return ENOMEM;
+               }
+
+               fprintf(stderr, "copying 0x%zx from 0x%"PRIx64"\n",
+                       count, r->start);
+
+               /* ssize_t keeps the full result; an int would truncate it */
+               ret = pread(ctx->sfd, buf, count, r->start);
+               if (ret < 0 || (size_t)ret != count) {
+                       /* save errno before printf()/free() can change it */
+                       err = errno;
+                       printf("pread(0x%zx 0x%"PRIx64") returned 0x%x (%d)\n",
+                              count, r->start, (unsigned int)ret, err);
+                       free(buf);
+                       return (err ? err : EIO);
+               }
+
+               ret = pwrite(ctx->dfd, buf, count, r->start);
+               if (ret < 0 || (size_t)ret != count) {
+                       err = errno;
+                       printf("pwrite(0x%zx 0x%"PRIx64") returned 0x%x (%d)\n",
+                              count, r->start, (unsigned int)ret, err);
+                       free(buf);
+                       return (err ? err : EIO);
+               }
+
+               free(buf);
+       }
+
+       return 0;
+}
+
+/* Close both descriptors of @ctx and free its chunk array. */
+static void
+random_copy_close(struct random_copy_ctx *ctx)
+{
+       close(ctx->sfd);
+       close(ctx->dfd);
+       free(ctx->chunks);
+}
+
+/*
+ * random-copy <src> <dst>: copy <src> over <dst> in randomly sized ranges
+ * written in random order.  The exit status is 0 on success, EINVAL for a
+ * wrong argument count, or the error value of the failing step.
+ */
+int
+main(int argc, char *argv[])
+{
+       int rc;
+       struct random_copy_ctx ctx;
+
+       if (argc != 3)
+               usage(argv[0], EINVAL);
+
+       rc = random_copy_init(&ctx, argv[1], argv[2]);
+       if (rc) {
+               printf("failed to init: %d\n", rc);
+               return rc;
+       }
+
+       rc = random_copy(&ctx);
+       if (rc)
+               printf("copy failed: %d\n", rc);
+
+       /* released on both outcomes; the copy result is the exit status */
+       random_copy_close(&ctx);
+
+       return rc;
+}
diff --git a/tools/blktap3/vhd/lib/test/test-snapshot.c 
b/tools/blktap3/vhd/lib/test/test-snapshot.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/test/test-snapshot.c
@@ -0,0 +1,161 @@
+/*
+ * libvhdio.so supports a simple test hook for validating vhd chains:
+ * if LIBVHD_IO_TEST is set, libvhdio will handle SIGCONT specially
+ * by closing, snapshotting, and reopening any vhds it is tracking.
+ *
+ * this harness simply forks a test and stops/continues it at a given interval.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+/* Print the command-line synopsis on stdout and exit with status @err. */
+static void
+usage(const char *app, int err)
+{
+       fprintf(stdout, "usage: %s <-i interval> -- <app and args>\n", app);
+       exit(err);
+}
+
+/*
+ * SIGCHLD handler: report that the child is gone and end the harness with
+ * status 0.
+ *
+ * Only async-signal-safe calls are used (write() and _exit()): the signal
+ * can arrive while the main loop is inside fprintf(), and calling stdio or
+ * exit() from the handler at that point is not safe.
+ */
+static void
+sighandler(int sig)
+{
+       static const char msg[] = "child exited\n";
+       ssize_t written;
+
+       (void)sig;
+
+       /* best effort: the process exits whether or not the message got out */
+       written = write(STDERR_FILENO, msg, sizeof(msg) - 1);
+       (void)written;
+
+       _exit(0);
+}
+
+/*
+ * Send SIGSTOP to @pid and wait until waitpid() reports a state change.
+ * Exits with status 0 if the child has exited, and with status 1 if the
+ * signal cannot be sent, the wait fails, or the child is not stopped.
+ */
+static void
+stop(pid_t pid)
+{
+       int wstatus;
+
+       fprintf(stderr, "stopping %d\n", pid);
+
+       if (kill(pid, SIGSTOP) != 0) {
+               perror("stop child");
+               exit(1);
+       }
+
+       /* WUNTRACED makes waitpid() report stopped children as well */
+       if (waitpid(pid, &wstatus, WUNTRACED) == -1) {
+               perror("waiting for child to stop");
+               exit(1);
+       }
+
+       if (WIFEXITED(wstatus))
+               exit(0);
+
+       if (WIFSTOPPED(wstatus))
+               return;
+
+       perror("child not stopped");
+       exit(1);
+}
+
+/*
+ * Send SIGCONT to @pid and wait until waitpid() reports a state change.
+ * Exits with status 0 if the child has exited, and with status 1 if the
+ * signal cannot be sent, the wait fails, or the child was not continued.
+ */
+static void
+resume(pid_t pid)
+{
+       int wstatus;
+
+       fprintf(stderr, "resuming %d\n", pid);
+
+       if (kill(pid, SIGCONT) != 0) {
+               perror("resume child");
+               exit(1);
+       }
+
+       /* WCONTINUED makes waitpid() report continued children as well */
+       if (waitpid(pid, &wstatus, WCONTINUED) == -1) {
+               perror("waiting for child to resume");
+               exit(1);
+       }
+
+       if (WIFEXITED(wstatus))
+               exit(0);
+
+       if (WIFCONTINUED(wstatus))
+               return;
+
+       perror("child not resumed");
+       exit(1);
+}
+
+/*
+ * Main loop of the harness: sleep @interval seconds, then stop and resume
+ * the child, forever.  The loop has no exit of its own; the process ends
+ * through the exit() calls in stop()/resume() or the SIGCHLD handler.
+ */
+static void
+test(pid_t pid, int interval)
+{
+       while (1) {
+               fprintf(stderr, "sleeping\n");
+               sleep(interval);
+
+               stop(pid);
+               resume(pid);
+       }
+}
+
+/*
+ * test-snapshot -i <interval> -- <app and args>
+ *
+ * Fork, run <app> in the child with LIBVHD_IO_TEST=y in its environment,
+ * and stop/continue the child every <interval> seconds from the parent.
+ * <interval> must be a positive number of seconds; anything else prints the
+ * usage text and exits with EINVAL.
+ */
+int
+main(int argc, char **argv)
+{
+       pid_t pid;
+       sigset_t set;
+       int c, err, interval;
+       struct sigaction act;
+
+       interval = 0;
+
+       while ((c = getopt(argc, argv, "i:h")) != -1) {
+               switch (c) {
+               case 'i':
+                       interval = atoi(optarg);
+                       break;
+               case 'h':
+                       usage(argv[0], 0);
+                       break;
+               default:
+                       usage(argv[0], EINVAL);
+                       break;
+               }
+       }
+
+       /*
+        * Reject a missing command and any interval that is not positive:
+        * a negative value would turn into a huge unsigned sleep() argument.
+        */
+       if (optind == argc || interval <= 0)
+               usage(argv[0], EINVAL);
+
+       if (sigemptyset(&set)) {
+               perror("init sigset");
+               exit(1);
+       }
+
+       /* SA_NOCLDSTOP: no SIGCHLD for the stop/continue cycles we cause */
+       act = (struct sigaction) {
+               .sa_handler = sighandler,
+               .sa_mask    = set,
+               .sa_flags   = SA_NOCLDSTOP,
+       };
+
+       if (sigaction(SIGCHLD, &act, NULL)) {
+               perror("register sig handler");
+               exit(1);
+       }
+
+       switch ((pid = fork())) {
+       case 0:
+               if (putenv("LIBVHD_IO_TEST=y")) {
+                       /* save errno first: perror() may overwrite it */
+                       err = errno;
+                       perror("setting environment");
+                       exit(err);
+               }
+
+               execvp(argv[optind], &argv[optind]);
+
+               /* only reached when execvp() failed */
+               err = errno;
+               perror("exec");
+               exit(err);
+       case -1:
+               err = errno;
+               perror("fork");
+               exit(err);
+       default:
+               test(pid, interval);
+               break;
+       }
+
+       return 0;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-check.c 
b/tools/blktap3/vhd/lib/vhd-util-check.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-check.c
@@ -0,0 +1,1272 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <time.h>
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <inttypes.h>
+#include <sys/stat.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * Allow a VHD timestamp to lie at most this many seconds in the future, to
+ * account for time skew with NFS servers.
+ */
+#define TIMESTAMP_MAX_SLACK 1800
+
+/* Flags selecting which checks are performed (non-zero means enabled). */
+struct vhd_util_check_options {
+       /* tolerate an invalid primary footer and rely on the backup copy */
+       char                             ignore_footer;
+       /* skip comparing the header's parent uuid with the parent's footer */
+       char                             ignore_parent_uuid;
+       /* skip the "timestamp lies in the future" checks */
+       char                             ignore_timestamps;
+       /* read data blocks and flag sectors holding data with a clear bit */
+       char                             check_data;
+       /* skip the pairwise block-overlap scan of the BAT */
+       char                             no_check_bat;
+       /* gather per-image sector statistics while walking the BAT */
+       char                             collect_stats;
+};
+
+/* List head type for the per-image statistics records below. */
+TAILQ_HEAD(tqh_vhd_util_check_stats, vhd_util_check_stats);
+
+/* Sector statistics gathered for one VHD image. */
+struct vhd_util_check_stats {
+       /* heap copy of the image's file name */
+       char                            *name;
+       /* one bit per sector (secs_total bits); set for written sectors */
+       char                            *bitmap;
+       /* sectors per block times the header's max_bat_size */
+       uint64_t                         secs_total;
+       /* sectors belonging to blocks that have a BAT entry */
+       uint64_t                         secs_allocated;
+       /* sectors whose bit is set in their block's bitmap */
+       uint64_t                         secs_written;
+       TAILQ_ENTRY(vhd_util_check_stats) next;
+};
+
+/* State shared by all checks of one invocation. */
+struct vhd_util_check_ctx {
+       /* which checks to run */
+       struct vhd_util_check_options    opts;
+       /* statistics records; new records are inserted at the head */
+       struct tqh_vhd_util_check_stats  stats;
+       /* set to 1 once the primary footer failed validation */
+       int                              primary_footer_missing;
+};
+
+/* The most recently inserted statistics record (the list head). */
+#define ctx_cur_stats(ctx) \
+       TAILQ_FIRST(&(ctx)->stats)
+
+/*
+ * Test bit @nr of the bitmap at @addr.  Bits are numbered most significant
+ * bit first within each byte (bit 0 is mask 0x80 of byte 0).
+ *
+ * Returns 1 if the bit is set, 0 otherwise.
+ */
+static inline int
+test_bit_u64(volatile char *addr, uint64_t nr)
+{
+       /*
+        * Go through unsigned char: where plain char is signed, a byte with
+        * its top bit set promotes to a negative int, and left-shifting a
+        * negative value is undefined behaviour.
+        */
+       unsigned char byte = (unsigned char)addr[nr >> 3];
+
+       return (byte >> (7 - (nr & 7))) & 1;
+}
+
+/*
+ * Set bit @nr of the bitmap at @addr; bit 0 is the most significant bit
+ * (mask 0x80) of the first byte.
+ */
+static inline void
+set_bit_u64(volatile char *addr, uint64_t nr)
+{
+       const uint64_t byte = nr >> 3;
+       const int      mask = 0x80 >> (nr & 7);
+
+       addr[byte] |= mask;
+}
+
+/* Reset the context's statistics list to an empty list. */
+static void
+vhd_util_check_stats_init(struct vhd_util_check_ctx *ctx)
+{
+       struct tqh_vhd_util_check_stats *list = &ctx->stats;
+
+       memset(list, 0, sizeof(*list));
+       TAILQ_INIT(list);
+}
+
+/* Release one statistics record and the buffers it owns; NULL is a no-op. */
+static void
+vhd_util_check_stats_free_one(struct vhd_util_check_stats *stats)
+{
+       if (!stats)
+               return;
+
+       free(stats->name);
+       free(stats->bitmap);
+       free(stats);
+}
+
+/*
+ * Allocate a zeroed statistics record for @vhd (name copy plus a bitmap
+ * with one bit per sector) and insert it at the head of ctx->stats, making
+ * it the record returned by ctx_cur_stats().
+ *
+ * Returns 0 on success or -ENOMEM if any allocation fails.
+ */
+static int
+vhd_util_check_stats_alloc_one(struct vhd_util_check_ctx *ctx,
+                              vhd_context_t *vhd)
+{
+       size_t size;
+       uint64_t bytes;
+       struct vhd_util_check_stats *stats;
+
+       stats = calloc(1, sizeof(*stats));
+       if (!stats)
+               goto fail;
+
+       stats->name = strdup(vhd->file);
+       if (!stats->name)
+               goto fail;
+
+       stats->secs_total = (uint64_t)vhd->spb * vhd->header.max_bat_size;
+
+       /*
+        * Keep the byte count in 64 bits and refuse sizes that do not fit in
+        * a size_t.  Truncating it (as a plain int would) yields a bitmap
+        * smaller than secs_total bits, which is later written out of bounds.
+        */
+       bytes = (stats->secs_total + 7) >> 3;
+       size  = (size_t)bytes;
+       if ((uint64_t)size != bytes)
+               goto fail;
+
+       stats->bitmap = calloc(1, size);
+       if (!stats->bitmap)
+               goto fail;
+
+       TAILQ_INSERT_HEAD(&ctx->stats, stats, next);
+
+       return 0;
+
+fail:
+       vhd_util_check_stats_free_one(stats);
+       printf("failed to allocate stats for %s\n", vhd->file);
+       return -ENOMEM;
+}
+
+/* Unlink and release every statistics record held by @ctx. */
+static void
+vhd_util_check_stats_free(struct vhd_util_check_ctx *ctx)
+{
+       struct vhd_util_check_stats *entry;
+
+       while (!TAILQ_EMPTY(&ctx->stats)) {
+               entry = TAILQ_FIRST(&ctx->stats);
+               TAILQ_REMOVE(&ctx->stats, entry, next);
+               vhd_util_check_stats_free_one(entry);
+       }
+}
+
+/* @num as a percentage of @den; yields 0 when @den is zero. */
+static inline float
+pct(uint64_t num, uint64_t den)
+{
+       if (den == 0)
+               return 0.0;
+
+       return ((float)num / (float)den) * 100.0;
+}
+
+/*
+ * Return a pointer to the last component of @path (no copy is made; the
+ * result points into @path).  Trailing slashes are ignored when locating
+ * the component but remain part of the returned string, e.g. "a/b/" yields
+ * "b/".  A path without any component (empty, or only slashes) is returned
+ * unchanged.
+ */
+static inline char *
+name(const char *path)
+{
+       const char *end = path + strlen(path);
+       const char *p;
+
+       /* step back over trailing slashes */
+       while (end > path && end[-1] == '/')
+               end--;
+
+       /* walk back to just after the preceding slash, or to the start */
+       for (p = end; p > path && p[-1] != '/'; p--)
+               ;
+
+       return (char *)p;
+}
+
+/*
+ * Print the statistics gathered in ctx->stats.
+ *
+ * The head record gets one line with its allocated/written sector counts.
+ * Each following record additionally reports how many of its written
+ * sectors are not set in the record before it in the list ("not in
+ * parent") and how many are not set in any record before it ("not in
+ * ancestors").  Nothing is printed for an empty list.
+ */
+static void
+vhd_util_check_stats_print(struct vhd_util_check_ctx *ctx)
+{
+       char *bitmap;
+       uint64_t secs;
+       struct vhd_util_check_stats *head, *cur, *prev;
+
+       if (TAILQ_EMPTY(&ctx->stats))
+               return;
+
+       head = TAILQ_FIRST(&ctx->stats);
+       printf("%s: secs allocated: 0x%"PRIx64" secs written: 0x%"PRIx64
+              " (%.2f%%)\n",
+              name(head->name), head->secs_allocated, head->secs_written,
+              pct(head->secs_written, head->secs_allocated));
+
+       /* a single record has nothing to be compared against */
+       if (TAILQ_LAST(&ctx->stats, tqh_vhd_util_check_stats) == head)
+               return;
+
+       secs = head->secs_total;
+
+       /* running union of all bitmaps seen so far, sized like the head's */
+       bitmap = malloc((secs + 7) >> 3);
+       if (!bitmap) {
+               printf("failed to allocate bitmap\n");
+               return;
+       }
+       memcpy(bitmap, head->bitmap, ((secs + 7) >> 3));
+
+       cur = prev = head;
+       while (TAILQ_LAST(&ctx->stats, tqh_vhd_util_check_stats) != cur) {
+               uint64_t i, lim, up = 0, uc = 0;
+
+               cur = TAILQ_NEXT(cur, next);
+
+               /*
+                * Every record's bitmap holds only its own secs_total bits,
+                * so never index one past that: sectors beyond a bitmap's
+                * end are treated as not written.
+                */
+               lim = (cur->secs_total < secs ? cur->secs_total : secs);
+
+               for (i = 0; i < lim; i++) {
+                       if (!test_bit_u64(cur->bitmap, i))
+                               continue;
+
+                       if (i >= prev->secs_total ||
+                           !test_bit_u64(prev->bitmap, i))
+                               up++; /* sector is unique wrt parent */
+
+                       if (!test_bit_u64(bitmap, i))
+                               uc++; /* sector is unique wrt chain */
+
+                       set_bit_u64(bitmap, i);
+               }
+
+               printf("%s: secs allocated: 0x%"PRIx64" secs written: 0x%"PRIx64
+                      " (%.2f%%) secs not in parent: 0x%"PRIx64" (%.2f%%)"
+                      " secs not in ancestors: 0x%"PRIx64" (%.2f%%)\n",
+                      name(cur->name), cur->secs_allocated, cur->secs_written,
+                      pct(cur->secs_written, cur->secs_allocated),
+                      up, pct(up, cur->secs_written),
+                      uc, pct(uc, cur->secs_written));
+
+               prev = cur;
+       }
+
+       free(bitmap);
+}
+
+/*
+ * Report whether @buf contains anything other than zero bytes.
+ *
+ * Returns 0 if all @size bytes are zero (or @size is 0) and 1 otherwise.
+ * The result is deliberately not the offset of the offending byte: an
+ * offset of 0 would be indistinguishable from "all zeros".
+ */
+static int
+vhd_util_check_zeros(void *buf, size_t size)
+{
+       size_t i;
+       const char *p = buf;
+
+       for (i = 0; i < size; i++)
+               if (p[i])
+                       return 1;
+
+       return 0;
+}
+
+/*
+ * Run the sanity checks on a footer (callers in this file pass it through
+ * vhd_footer_in() first).
+ *
+ * Returns NULL if every check passes, otherwise a constant string naming
+ * the first check that failed.  The checks run in the order written, so
+ * that order determines which message is reported.
+ */
+static char *
+vhd_util_check_validate_footer(struct vhd_util_check_ctx *ctx,
+                              vhd_footer_t *footer)
+{
+       int size;
+       uint32_t checksum;
+
+       size = sizeof(footer->cookie);
+       if (memcmp(footer->cookie, HD_COOKIE, size))
+               return "invalid cookie";
+
+       checksum = vhd_checksum_footer(footer);
+       if (checksum != footer->checksum) {
+               /*
+                * Footers written by creator "tap" versions 0.1 and 1.1 with
+                * the hidden flag set are also accepted when the stored
+                * checksum matches the footer with 'hidden' cleared.
+                */
+               if (footer->hidden &&
+                   !strncmp(footer->crtr_app, "tap", 3) &&
+                   (footer->crtr_ver == VHD_VERSION(0, 1) ||
+                    footer->crtr_ver == VHD_VERSION(1, 1))) {
+                       char tmp = footer->hidden;
+                       footer->hidden = 0;
+                       checksum = vhd_checksum_footer(footer);
+                       footer->hidden = tmp;
+
+                       if (checksum == footer->checksum)
+                               goto ok;
+               }
+
+               return "invalid checksum";
+       }
+
+ok:
+       /* HD_RESERVED must be set; only HD_TEMPORARY may accompany it */
+       if (!(footer->features & HD_RESERVED))
+               return "invalid 'reserved' feature";
+
+       if (footer->features & ~(HD_TEMPORARY | HD_RESERVED))
+               return "invalid extra features";
+
+       if (footer->ff_version != HD_FF_VERSION)
+               return "invalid file format version";
+
+       /* anything but a dynamic or differencing disk needs all-ones here */
+       if (footer->type != HD_TYPE_DYNAMIC &&
+           footer->type != HD_TYPE_DIFF    &&
+           footer->data_offset != ~(0ULL))
+               return "invalid data offset";
+
+       if (!ctx->opts.ignore_timestamps) {
+               uint32_t now = vhd_time(time(NULL));
+               if (footer->timestamp > now + TIMESTAMP_MAX_SLACK)
+                       return "creation time in future";
+       }
+
+       if (!strncmp(footer->crtr_app, "tap", 3) &&
+           footer->crtr_ver > VHD_CURRENT_VERSION)
+               return "unsupported tap creator version";
+
+       /* stored geometry may not exceed the one vhd_chs() gives curr_size */
+       if (vhd_chs(footer->curr_size) < footer->geometry)
+               return "geometry too large";
+
+       if (footer->type != HD_TYPE_FIXED   &&
+           footer->type != HD_TYPE_DYNAMIC &&
+           footer->type != HD_TYPE_DIFF)
+               return "invalid type";
+
+       /* 'saved' and 'hidden' must each be 0 or 1 */
+       if (footer->saved && footer->saved != 1)
+               return "invalid 'saved' state";
+
+       if (footer->hidden && footer->hidden != 1)
+               return "invalid 'hidden' state";
+
+       if (vhd_util_check_zeros(footer->reserved,
+                                sizeof(footer->reserved)))
+               return "invalid 'reserved' bits";
+
+       return NULL;
+}
+
+/*
+ * Run the sanity checks on a header (the caller in this file passes it
+ * through vhd_header_in() first).  @fd is only used to find the file size.
+ *
+ * Returns NULL if every check passes, otherwise a constant string naming
+ * the first check that failed.
+ */
+static char *
+vhd_util_check_validate_header(int fd, vhd_header_t *header)
+{
+       off64_t eof;
+       int i, cnt, size;
+       uint32_t checksum;
+
+       size = sizeof(header->cookie);
+       if (memcmp(header->cookie, DD_COOKIE, size))
+               return "invalid cookie";
+
+       checksum = vhd_checksum_header(header);
+       if (checksum != header->checksum)
+               return "invalid checksum";
+
+       if (header->hdr_ver != 0x00010000)
+               return "invalid header version";
+
+       if (header->data_offset != ~(0ULL))
+               return "invalid data offset";
+
+       eof = lseek64(fd, 0, SEEK_END);
+       if (eof == (off64_t)-1)
+               return "error finding eof";
+
+       /*
+        * The table must start at a non-zero, 512-byte aligned offset and
+        * its max_bat_size 32-bit entries must end at or before the last
+        * sizeof(vhd_footer_t) bytes of the file.
+        */
+       if (header->table_offset <= 0  ||
+           header->table_offset % 512 ||
+           (header->table_offset +
+            (header->max_bat_size * sizeof(uint32_t)) >
+            eof - sizeof(vhd_footer_t)))
+               return "invalid table offset";
+
+       /* count the set bits: block_size must be a power of two */
+       for (cnt = 0, i = 0; i < sizeof(header->block_size) * 8; i++)
+               if ((header->block_size >> i) & 1)
+                       cnt++;
+
+       if (cnt != 1)
+               return "invalid block size";
+
+       if (header->res1)
+               return "invalid reserved bits";
+
+       if (vhd_util_check_zeros(header->res2, sizeof(header->res2)))
+               return "invalid reserved bits";
+
+       return NULL;
+}
+
+/*
+ * Check the parent-related header fields of @vhd.
+ *
+ * A differencing disk must carry a decodable parent name and, unless
+ * timestamps are ignored, a parent timestamp that is not in the future.
+ * Any other disk type must leave the parent name, locators, uuid and
+ * timestamp blank.
+ *
+ * Returns NULL when the fields are consistent, otherwise a constant string
+ * describing the first problem found.
+ */
+static char *
+vhd_util_check_validate_differencing_header(struct vhd_util_check_ctx *ctx,
+                                           vhd_context_t *vhd)
+{
+       char *parent;
+       vhd_header_t *hdr = &vhd->header;
+
+       if (vhd->footer.type != HD_TYPE_DIFF) {
+               /* no parent: every parent-related field has to be blank */
+               if (vhd_util_check_zeros(hdr->prt_name,
+                                        sizeof(hdr->prt_name)))
+                       return "invalid non-null parent name";
+
+               if (vhd_util_check_zeros(hdr->loc, sizeof(hdr->loc)))
+                       return "invalid non-null parent locators";
+
+               if (!uuid_is_null(hdr->prt_uuid))
+                       return "invalid non-null parent uuid";
+
+               if (hdr->prt_ts)
+                       return "invalid non-zero parent timestamp";
+
+               return NULL;
+       }
+
+       if (!ctx->opts.ignore_timestamps) {
+               uint32_t now = vhd_time(time(NULL));
+               if (hdr->prt_ts > now + TIMESTAMP_MAX_SLACK)
+                       return "parent creation time in future";
+       }
+
+       if (vhd_header_decode_parent(vhd, hdr, &parent))
+               return "invalid parent name";
+
+       free(parent);
+       return NULL;
+}
+
+/*
+ * Run the sanity checks on @batmap, which belongs to @vhd.
+ *
+ * Returns NULL if every check passes, otherwise a constant string naming
+ * the first check that failed.
+ */
+static char *
+vhd_util_check_validate_batmap(vhd_context_t *vhd, vhd_batmap_t *batmap)
+{
+       int size;
+       off64_t eof;
+       uint32_t checksum;
+
+       size = sizeof(batmap->header.cookie);
+       if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, size))
+               return "invalid cookie";
+
+       if (batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION)
+               return "unsupported batmap version";
+
+       checksum = vhd_checksum_batmap(vhd, batmap);
+       if (checksum != batmap->header.checksum)
+               return "invalid checksum";
+
+       if (!batmap->header.batmap_size)
+               return "invalid size zero";
+
+       /*
+        * batmap_size << (VHD_SECTOR_SHIFT + 3) is the number of bits the
+        * batmap holds; it must be at least the number of BAT entries.
+        */
+       if (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3) <
+                       vhd->header.max_bat_size)
+               return "batmap-BAT size mismatch";
+
+       eof = lseek64(vhd->fd, 0, SEEK_END);
+       if (eof == (off64_t)-1)
+               return "error finding eof";
+
+       if (!batmap->header.batmap_offset ||
+           batmap->header.batmap_offset % 512)
+               return "invalid batmap offset";
+
+       /* the batmap must end at or before the footer at the end of file */
+       if ((batmap->header.batmap_offset +
+            vhd_sectors_to_bytes(batmap->header.batmap_size)) >
+           eof - sizeof(vhd_footer_t))
+               return "invalid batmap size";
+
+       return NULL;
+}
+
+/*
+ * Run the sanity checks on one parent locator entry of @vhd's header.
+ *
+ * Returns NULL if the locator is acceptable, otherwise a constant string
+ * naming the first check that failed.
+ */
+static char *
+vhd_util_check_validate_parent_locator(vhd_context_t *vhd,
+                                      vhd_parent_locator_t *loc)
+{
+       off64_t eof;
+
+       if (vhd_validate_platform_code(loc->code))
+               return "invalid platform code";
+
+       /* an unused slot must be entirely zero; nothing else to verify */
+       if (loc->code == PLAT_CODE_NONE) {
+               if (vhd_util_check_zeros(loc, sizeof(*loc)))
+                       return "non-zero locator";
+
+               return NULL;
+       }
+
+       if (!loc->data_offset)
+               return "invalid data offset";
+
+       if (!loc->data_space)
+               return "invalid data space";
+
+       if (!loc->data_len)
+               return "invalid data length";
+
+       eof = lseek64(vhd->fd, 0, SEEK_END);
+       if (eof == (off64_t)-1)
+               return "error finding eof";
+
+       /* the locator data must end at or before the footer at end of file */
+       if (loc->data_offset + vhd_parent_locator_size(loc) >
+           eof - sizeof(vhd_footer_t))
+               return "invalid size";
+
+       if (loc->res)
+               return "invalid reserved bits";
+
+       return NULL;
+}
+
+/*
+ * Verify that the image at @ppath is the parent @vhd's header refers to,
+ * i.e. that the header's parent uuid equals the uuid in the parent's
+ * footer.  The check is skipped when vhd_parent_raw() reports a raw parent
+ * or when ignore_parent_uuid is set.
+ *
+ * Returns NULL on success (or when skipped), otherwise a constant string
+ * describing the problem.
+ */
+static char *
+vhd_util_check_validate_parent(struct vhd_util_check_ctx *ctx,
+                              vhd_context_t *vhd, const char *ppath)
+{
+       char *msg = NULL;
+       vhd_context_t parent;
+
+       if (vhd_parent_raw(vhd) || ctx->opts.ignore_parent_uuid)
+               return NULL;
+
+       if (vhd_open(&parent, ppath,
+                    VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED))
+               return "error opening parent";
+
+       if (uuid_compare(vhd->header.prt_uuid, parent.footer.uuid))
+               msg = "invalid parent uuid";
+
+       vhd_close(&parent);
+       return msg;
+}
+
+/*
+ * Read and validate the footer(s) of the image open on @fd.
+ *
+ * The primary footer is read from the end of the file.  If it is valid
+ * and describes a fixed disk, the function succeeds right away WITHOUT
+ * writing to *footer.  Otherwise the backup copy is read from offset 0,
+ * validated, and compared with the primary; on success the primary (or,
+ * with ignore_footer set and the two differing, the backup) is copied to
+ * *footer.
+ *
+ * Returns 0 on success or a negative error code after printing a message.
+ */
+static int
+vhd_util_check_footer(struct vhd_util_check_ctx *ctx,
+                     int fd, vhd_footer_t *footer)
+{
+       int err;
+       size_t size;
+       char *msg;
+       void *buf;
+       off64_t eof, off;
+       vhd_footer_t primary, backup;
+
+       memset(&primary, 0, sizeof(primary));
+       memset(&backup, 0, sizeof(backup));
+
+       /* sector-aligned bounce buffer for the reads below */
+       err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(primary));
+       if (err) {
+               printf("error allocating buffer: %d\n", err);
+               return -err;
+       }
+
+       memset(buf, 0, sizeof(primary));
+
+       eof = lseek64(fd, 0, SEEK_END);
+       if (eof == (off64_t)-1) {
+               err = -errno;
+               printf("error calculating end of file: %d\n", err);
+               goto out;
+       }
+
+       /* a file size that is not a multiple of 512 means a 511-byte footer */
+       size = ((eof % 512) ? 511 : 512);
+       eof  = lseek64(fd, eof - size, SEEK_SET);
+       if (eof == (off64_t)-1) {
+               err = -errno;
+               printf("error calculating end of file: %d\n", err);
+               goto out;
+       }
+
+       /* 512 bytes are requested, but exactly 'size' must come back */
+       err = read(fd, buf, 512);
+       if (err != size) {
+               /* NOTE(review): errno may be stale if the read was just short */
+               err = (errno ? -errno : -EIO);
+               printf("error reading primary footer: %d\n", err);
+               goto out;
+       }
+
+       memcpy(&primary, buf, sizeof(primary));
+       vhd_footer_in(&primary);
+
+       msg = vhd_util_check_validate_footer(ctx, &primary);
+       if (msg) {
+               /* remembered for the BAT check's end-of-file tolerance */
+               ctx->primary_footer_missing = 1;
+
+               if (ctx->opts.ignore_footer)
+                       goto check_backup;
+
+               err = -EINVAL;
+               printf("primary footer invalid: %s\n", msg);
+               goto out;
+       }
+
+       /* fixed disk: the backup copy is not examined, *footer stays untouched */
+       if (primary.type == HD_TYPE_FIXED) {
+               err = 0;
+               goto out;
+       }
+
+check_backup:
+       /* the backup copy of the footer is read from the start of the file */
+       off = lseek64(fd, 0, SEEK_SET);
+       if (off == (off64_t)-1) {
+               err = -errno;
+               printf("error seeking to backup footer: %d\n", err);
+               goto out;
+       }
+
+       size = 512;
+       memset(buf, 0, sizeof(primary));
+
+       err = read(fd, buf, size);
+       if (err != size) {
+               err = (errno ? -errno : -EIO);
+               printf("error reading backup footer: %d\n", err);
+               goto out;
+       }
+
+       memcpy(&backup, buf, sizeof(backup));
+       vhd_footer_in(&backup);
+
+       msg = vhd_util_check_validate_footer(ctx, &backup);
+       if (msg) {
+               err = -EINVAL;
+               printf("backup footer invalid: %s\n", msg);
+               goto out;
+       }
+
+       if (memcmp(&primary, &backup, sizeof(primary))) {
+               /* with ignore_footer the backup simply replaces the primary */
+               if (ctx->opts.ignore_footer) {
+                       memcpy(&primary, &backup, sizeof(primary));
+                       goto ok;
+               }
+
+               /*
+                * Same creator "tap" 0.1 / 1.1 exception as in footer
+                * validation: tolerate a mismatch that disappears once the
+                * backup's 'hidden' field is cleared.
+                */
+               if (backup.hidden &&
+                   !strncmp(backup.crtr_app, "tap", 3) &&
+                   (backup.crtr_ver == VHD_VERSION(0, 1) ||
+                    backup.crtr_ver == VHD_VERSION(1, 1))) {
+                       char cmp, tmp = backup.hidden;
+                       backup.hidden = 0;
+                       cmp = memcmp(&primary, &backup, sizeof(primary));
+                       backup.hidden = tmp;
+                       if (!cmp)
+                               goto ok;
+               }
+
+               err = -EINVAL;
+               printf("primary and backup footers do not match\n");
+               goto out;
+       }
+
+ok:
+       err = 0;
+       memcpy(footer, &primary, sizeof(primary));
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Read the header located at footer->data_offset of the image open on @fd
+ * and validate it.
+ *
+ * Returns 0 if the header is valid, otherwise a negative error code after
+ * printing a message (-EIO for a short read, -EINVAL for a header that
+ * fails validation).
+ */
+static int
+vhd_util_check_header(int fd, vhd_footer_t *footer)
+{
+       int err;
+       ssize_t n;
+       off64_t off;
+       char *msg;
+       void *buf;
+       vhd_header_t header;
+
+       /* sector-aligned bounce buffer for the read below */
+       err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(header));
+       if (err) {
+               printf("error allocating header: %d\n", err);
+               /* posix_memalign() returns a positive errno; negate it like
+                * every other failure path here */
+               return -err;
+       }
+
+       off = footer->data_offset;
+       off = lseek64(fd, off, SEEK_SET);
+       if (off == (off64_t)-1) {
+               err = -errno;
+               printf("error seeking to header: %d\n", err);
+               goto out;
+       }
+
+       n = read(fd, buf, sizeof(header));
+       if (n != (ssize_t)sizeof(header)) {
+               /* errno is only meaningful when read() itself failed */
+               err = (n < 0 ? -errno : -EIO);
+               printf("error reading header: %d\n", err);
+               goto out;
+       }
+
+       memcpy(&header, buf, sizeof(header));
+       vhd_header_in(&header);
+
+       msg = vhd_util_check_validate_header(fd, &header);
+       if (msg) {
+               err = -EINVAL;
+               printf("header is invalid: %s\n", msg);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Validate the parent-related header fields of @vhd, printing the reason
+ * on failure.  Returns 0 if they are valid and -EINVAL otherwise.
+ */
+static int
+vhd_util_check_differencing_header(struct vhd_util_check_ctx *ctx,
+                                  vhd_context_t *vhd)
+{
+       const char *why;
+
+       why = vhd_util_check_validate_differencing_header(ctx, vhd);
+       if (!why)
+               return 0;
+
+       printf("differencing header is invalid: %s\n", why);
+       return -EINVAL;
+}
+
+/*
+ * Examine the sector bitmap of one allocated @block of @vhd.
+ *
+ * With collect_stats set, every sector marked in the block's bitmap is
+ * counted in, and recorded in the bitmap of, the current statistics
+ * record.  With check_data set, the block's data is read as well and every
+ * sector that vhd_util_check_zeros() reports as non-zero while its bitmap
+ * bit is clear is reported.
+ *
+ * Returns 0 on success, the error of a failed read, or -EINVAL if any
+ * sector failed the data check (all sectors are still examined).
+ */
+static int
+vhd_util_check_bitmap(struct vhd_util_check_ctx *ctx,
+                     vhd_context_t *vhd, uint32_t block)
+{
+       int err, i;
+       uint64_t sector;
+       char *bitmap, *data;
+
+       data   = NULL;
+       bitmap = NULL;
+       /* index of the block's first sector in the image-wide stats bitmap */
+       sector = (uint64_t)block * vhd->spb;
+
+       err = vhd_read_bitmap(vhd, block, &bitmap);
+       if (err) {
+               printf("error reading bitmap 0x%x\n", block);
+               goto out;
+       }
+
+       if (ctx->opts.check_data) {
+               err = vhd_read_block(vhd, block, &data);
+               if (err) {
+                       printf("error reading data block 0x%x\n", block);
+                       goto out;
+               }
+       }
+
+       for (i = 0; i < vhd->spb; i++) {
+               if (ctx->opts.collect_stats &&
+                   vhd_bitmap_test(vhd, bitmap, i)) {
+                       ctx_cur_stats(ctx)->secs_written++;
+                       set_bit_u64(ctx_cur_stats(ctx)->bitmap, sector + i);
+               }
+
+               if (ctx->opts.check_data) {
+                       char *buf = data + (i << VHD_SECTOR_SHIFT);
+                       int set   = vhd_util_check_zeros(buf, VHD_SECTOR_SIZE);
+                       int map   = vhd_bitmap_test(vhd, bitmap, i);
+
+                       /* data present although the bitmap says unwritten */
+                       if (set && !map) {
+                               printf("sector 0x%x of block 0x%x has data "
+                                      "where bitmap is clear\n", i, block);
+                               err = -EINVAL;
+                       }
+               }
+       }
+
+out:
+       free(data);
+       free(bitmap);
+       return err;
+}
+
+/*
+ * Validate the block allocation table (BAT) of @vhd.
+ *
+ * Every allocated block must lie after the headers and before the footer
+ * and, unless no_check_bat is set, must not overlap any other allocated
+ * block.  With check_data or collect_stats set, each allocated block's
+ * bitmap is examined too.  With collect_stats set, a new statistics record
+ * is allocated for the image first.
+ *
+ * Returns 0 on success or a negative error code after printing a message.
+ */
+static int
+vhd_util_check_bat(struct vhd_util_check_ctx *ctx, vhd_context_t *vhd)
+{
+       off64_t eof, eoh;
+       uint64_t vhd_blks;
+       int i, j, err, block_size;
+
+       if (ctx->opts.collect_stats) {
+               err = vhd_util_check_stats_alloc_one(ctx, vhd);
+               if (err)
+                       return err;
+       }
+
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err) {
+               printf("error calculating eof: %d\n", err);
+               return err;
+       }
+
+       eof = vhd_position(vhd);
+       if (eof == (off64_t)-1) {
+               /*
+                * Capture errno once, before printf() gets a chance to
+                * change it, so the printed and the returned code agree and
+                * a failure can never be returned as 0.
+                */
+               err = -errno;
+               printf("error calculating eof: %d\n", err);
+               return err;
+       }
+
+       /* adjust eof for vhds with short footers */
+       if (eof % 512) {
+               if (eof % 512 != 511) {
+                       printf("invalid file size: 0x%"PRIx64"\n", eof);
+                       return -EINVAL;
+               }
+
+               eof++;
+       }
+
+       err = vhd_get_bat(vhd);
+       if (err) {
+               printf("error reading bat: %d\n", err);
+               return err;
+       }
+
+       err = vhd_end_of_headers(vhd, &eoh);
+       if (err) {
+               printf("error calculating end of metadata: %d\n", err);
+               return err;
+       }
+
+       /* from here on eof and eoh are sector numbers, like BAT entries */
+       eof  -= sizeof(vhd_footer_t);
+       eof >>= VHD_SECTOR_SHIFT;
+       eoh >>= VHD_SECTOR_SHIFT;
+       /* sectors occupied by one block: its data plus its bitmap */
+       block_size = vhd->spb + vhd->bm_secs;
+
+       vhd_blks = vhd->footer.curr_size >> VHD_BLOCK_SHIFT;
+       if (vhd_blks > vhd->header.max_bat_size) {
+               printf("VHD size (%"PRIu64" blocks) exceeds BAT size (%u)\n",
+                      vhd_blks, vhd->header.max_bat_size);
+               return -EINVAL;
+       }
+
+       for (i = 0; i < vhd_blks; i++) {
+               uint32_t off = vhd->bat.bat[i];
+               if (off == DD_BLK_UNUSED)
+                       continue;
+
+               if (off < eoh) {
+                       printf("block %d (offset 0x%x) clobbers headers\n",
+                              i, off);
+                       return -EINVAL;
+               }
+
+               if (off + block_size > eof) {
+                       /*
+                        * A block ending exactly one sector past the
+                        * computed limit is tolerated when the primary
+                        * footer is missing and ignore_footer is set.
+                        */
+                       if (!(ctx->primary_footer_missing &&
+                             ctx->opts.ignore_footer     &&
+                             off + block_size == eof + 1)) {
+                               printf("block %d (offset 0x%x) clobbers "
+                                      "footer\n", i, off);
+                               return -EINVAL;
+                       }
+               }
+
+               if (ctx->opts.no_check_bat)
+                       continue;
+
+               /* pairwise overlap scan; err is 0 on entry to this loop */
+               for (j = 0; j < vhd_blks; j++) {
+                       uint32_t joff = vhd->bat.bat[j];
+
+                       if (i == j)
+                               continue;
+
+                       if (joff == DD_BLK_UNUSED)
+                               continue;
+
+                       if (off == joff)
+                               err = -EINVAL;
+
+                       if (off > joff && off < joff + block_size)
+                               err = -EINVAL;
+
+                       if (off + block_size > joff &&
+                           off + block_size < joff + block_size)
+                               err = -EINVAL;
+
+                       if (err) {
+                               printf("block %d (offset 0x%x) clobbers "
+                                      "block %d (offset 0x%x)\n",
+                                      i, off, j, joff);
+                               return err;
+                       }
+               }
+
+               if (ctx->opts.check_data || ctx->opts.collect_stats) {
+                       if (ctx->opts.collect_stats)
+                               ctx_cur_stats(ctx)->secs_allocated += vhd->spb;
+
+                       err = vhd_util_check_bitmap(ctx, vhd, i);
+                       if (err)
+                               return err;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Load and validate the batmap of @vhd, then make sure no block the
+ * batmap marks is unallocated in the BAT.
+ *
+ * Returns 0 on success or a negative error code after printing a message.
+ */
+static int
+vhd_util_check_batmap(vhd_context_t *vhd)
+{
+       int blk, err;
+       char *reason;
+
+       if ((err = vhd_get_bat(vhd)) != 0) {
+               printf("error reading bat: %d\n", err);
+               return err;
+       }
+
+       if ((err = vhd_get_batmap(vhd)) != 0) {
+               printf("error reading batmap: %d\n", err);
+               return err;
+       }
+
+       reason = vhd_util_check_validate_batmap(vhd, &vhd->batmap);
+       if (reason) {
+               printf("batmap is invalid: %s\n", reason);
+               return -EINVAL;
+       }
+
+       for (blk = 0; blk < vhd->footer.curr_size >> VHD_BLOCK_SHIFT; blk++) {
+               /* a block flagged in the batmap must own a BAT entry */
+               if (vhd_batmap_test(vhd, &vhd->batmap, blk) &&
+                   vhd->bat.bat[blk] == DD_BLK_UNUSED) {
+                       printf("batmap shows unallocated block %d full\n",
+                              blk);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Validate all parent locator entries in the header of @vhd.
+ *
+ * Every entry must pass vhd_util_check_validate_parent_locator() and no
+ * platform code may occur twice.  Locators with code MACX, W2RU or W2KU
+ * are additionally read; their file name must match the parent name
+ * decoded from the header, and the parent they resolve to is verified with
+ * vhd_util_check_validate_parent().  At least one locator must pass these
+ * additional checks.
+ *
+ * Returns 0 on success or a negative error code after printing a message.
+ */
+static int
+vhd_util_check_parent_locators(struct vhd_util_check_ctx *ctx,
+                              vhd_context_t *vhd)
+{
+       int i, n, err;
+       vhd_parent_locator_t *loc;
+       char *msg, *file, *ppath, *location, *pname;
+       /* per-platform-code occurrence counters, used to detect duplicates */
+       int mac, macx, w2ku, w2ru, wi2r, wi2k, found;
+
+       mac      = 0;
+       macx     = 0;
+       w2ku     = 0;
+       w2ru     = 0;
+       wi2r     = 0;
+       wi2k     = 0;
+       found    = 0;
+       pname    = NULL;
+       ppath    = NULL;
+       location = NULL;
+
+       err = vhd_header_decode_parent(vhd, &vhd->header, &pname);
+       if (err) {
+               printf("error decoding parent name: %d\n", err);
+               return err;
+       }
+
+       n = sizeof(vhd->header.loc) / sizeof(vhd->header.loc[0]);
+       for (i = 0; i < n; i++) {
+               /* start from NULL so the shared cleanup at 'out' stays safe */
+               ppath    = NULL;
+               location = NULL;
+               loc = vhd->header.loc + i;
+
+               msg = vhd_util_check_validate_parent_locator(vhd, loc);
+               if (msg) {
+                       err = -EINVAL;
+                       printf("invalid parent locator %d: %s\n", i, msg);
+                       goto out;
+               }
+
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               /* each platform code may appear at most once */
+               switch (loc->code) {
+               case PLAT_CODE_MACX:
+                       if (macx++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_MAC:
+                       if (mac++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_W2KU:
+                       if (w2ku++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_W2RU:
+                       if (w2ru++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_WI2R:
+                       if (wi2r++)
+                               goto dup;
+                       break;
+
+               case PLAT_CODE_WI2K:
+                       if (wi2k++)
+                               goto dup;
+                       break;
+
+               default:
+                       err = -EINVAL;
+                       printf("invalid  platform code for locator %d\n", i);
+                       goto out;
+               }
+
+               /* only these three locator kinds are read and resolved */
+               if (loc->code != PLAT_CODE_MACX &&
+                   loc->code != PLAT_CODE_W2RU &&
+                   loc->code != PLAT_CODE_W2KU)
+                       continue;
+
+               err = vhd_parent_locator_read(vhd, loc, &ppath);
+               if (err) {
+                       printf("error reading parent locator %d: %d\n", i, err);
+                       goto out;
+               }
+
+               file = basename(ppath);
+               if (strcmp(pname, file)) {
+                       err = -EINVAL;
+                       printf("parent locator %d name (%s) does not match "
+                              "header name (%s)\n", i, file, pname);
+                       goto out;
+               }
+
+               err = vhd_find_parent(vhd, ppath, &location);
+               if (err) {
+                       printf("error resolving %s: %d\n", ppath, err);
+                       goto out;
+               }
+
+               /* an unreadable parent is only fatal for a MACX locator */
+               err = access(location, R_OK);
+               if (err && loc->code == PLAT_CODE_MACX) {
+                       err = -errno;
+                       printf("parent locator %d points to missing file %s "
+                               "(resolved to %s)\n", i, ppath, location);
+                       goto out;
+               }
+
+               msg = vhd_util_check_validate_parent(ctx, vhd, location);
+               if (msg) {
+                       err = -EINVAL;
+                       printf("invalid parent %s: %s\n", location, msg);
+                       goto out;
+               }
+
+               found++;
+               free(ppath);
+               free(location);
+               ppath = NULL;
+               location = NULL;
+
+               continue;
+
+       dup:
+               printf("duplicate platform code in locator %d: 0x%x\n",
+                      i, loc->code);
+               err = -EINVAL;
+               goto out;
+       }
+
+       if (!found) {
+               err = -EINVAL;
+               printf("could not find parent %s\n", pname);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(pname);
+       free(ppath);
+       free(location);
+       return err;
+}
+
+/*
+ * Announce that @name looks invalid and dump its metadata by calling
+ * vhd_util_read() with the argument vector { "read", "-p", "-n", @name }.
+ */
+static void
+vhd_util_dump_headers(const char *name)
+{
+       char *args[4];
+       int nargs;
+
+       args[0] = "read";
+       args[1] = "-p";
+       args[2] = "-n";
+       args[3] = (char *)name;
+       nargs   = sizeof(args) / sizeof(args[0]);
+
+       printf("%s appears invalid; dumping metadata\n", name);
+       vhd_util_read(nargs, args);
+}
+
+/*
+ * Run the full set of consistency checks on a single VHD file.
+ *
+ * @ctx:  check context (options and statistics) shared by all checks
+ * @name: path of the file to check; must be a regular file or block device
+ *
+ * The checks run in this order: footer, header, differencing header, BAT,
+ * batmap (only if present) and, for differencing disks, parent locators.
+ * Images whose footer type is neither dynamic nor differencing stop right
+ * after the footer check.
+ *
+ * Returns 0 on success or a negative error code.  Whenever a check fails
+ * after the file was opened, the metadata is dumped through
+ * vhd_util_dump_headers().
+ */
+static int
+vhd_util_check_vhd(struct vhd_util_check_ctx *ctx, const char *name)
+{
+       int fd, err;
+       vhd_context_t vhd;
+       struct stat stats;
+       vhd_footer_t footer;
+
+       fd = -1;
+       memset(&vhd, 0, sizeof(vhd));
+       memset(&footer, 0, sizeof(footer));
+
+       err = stat(name, &stats);
+       if (err == -1) {
+               /* latch errno first: printf() is allowed to modify it */
+               err = -errno;
+               printf("cannot stat %s: %d\n", name, -err);
+               return err;
+       }
+
+       if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) {
+               printf("%s is not a regular file or block device\n", name);
+               return -EINVAL;
+       }
+
+       fd = open(name, O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (fd == -1) {
+               /* same as above: do not let printf() clobber the cause */
+               err = -errno;
+               printf("error opening %s\n", name);
+               return err;
+       }
+
+       err = vhd_util_check_footer(ctx, fd, &footer);
+       if (err)
+               goto out;
+
+       /* nothing beyond the footer is checked for other disk types */
+       if (footer.type != HD_TYPE_DYNAMIC && footer.type != HD_TYPE_DIFF)
+               goto out;
+
+       err = vhd_util_check_header(fd, &footer);
+       if (err)
+               goto out;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+       if (err)
+               goto out;
+
+       err = vhd_util_check_differencing_header(ctx, &vhd);
+       if (err)
+               goto out;
+
+       err = vhd_util_check_bat(ctx, &vhd);
+       if (err)
+               goto out;
+
+       if (vhd_has_batmap(&vhd)) {
+               err = vhd_util_check_batmap(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       if (vhd.footer.type == HD_TYPE_DIFF) {
+               err = vhd_util_check_parent_locators(ctx, &vhd);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+
+       /* the "valid" line is suppressed when statistics are collected */
+       if (!ctx->opts.collect_stats)
+               printf("%s is valid\n", name);
+
+out:
+       if (err)
+               vhd_util_dump_headers(name);
+       if (fd != -1)
+               close(fd);
+       /*
+        * NOTE(review): vhd may still be the zeroed context here if
+        * vhd_open() was never reached; this relies on vhd_close()
+        * tolerating that -- confirm.
+        */
+       vhd_close(&vhd);
+       return err;
+}
+
+/*
+ * Check every VHD parent reachable from @name.
+ *
+ * Starting at @name, each image is opened; as long as it is a differencing
+ * disk whose parent is not raw, the parent path is looked up and handed to
+ * vhd_util_check_vhd().  The walk ends at the first image that is not a
+ * differencing disk, at a raw parent, or at the first error.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_util_check_parents(struct vhd_util_check_ctx *ctx, const char *name)
+{
+       vhd_context_t vhd;
+       char *path, *parent;
+       int err;
+
+       /* @name itself is never freed; only paths we obtained ourselves */
+       path = (char *)name;
+
+       while (1) {
+               err = vhd_open(&vhd, path,
+                               VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+               if (err)
+                       break;
+
+               if (vhd.footer.type != HD_TYPE_DIFF || vhd_parent_raw(&vhd)) {
+                       /* end of the VHD chain: nothing further to check */
+                       vhd_close(&vhd);
+                       break;
+               }
+
+               err = vhd_parent_locator_get(&vhd, &parent);
+               vhd_close(&vhd);
+
+               if (err) {
+                       printf("error getting parent: %d\n", err);
+                       break;
+               }
+
+               if (path != name)
+                       free(path);
+               path = parent;
+
+               err = vhd_util_check_vhd(ctx, path);
+               if (err)
+                       break;
+       }
+
+       if (err)
+               printf("error checking parents: %d\n", err);
+       if (path != name)
+               free(path);
+       return err;
+}
+
+/*
+ * Entry point of the "check" command.
+ *
+ * Options (parsed with getopt):
+ *   -n <file>  image to check (required)
+ *   -i         ignore missing primary footers
+ *   -I         ignore parent uuids
+ *   -t         ignore timestamps
+ *   -p         also check the parents of the image
+ *   -b         check bitmaps
+ *   -B         do not check the BAT for overlapping (precludes -s and -b)
+ *   -s         collect and print statistics
+ *   -h         print usage
+ *
+ * Returns 0 on success (and for -h), -EINVAL for invalid arguments, or the
+ * error reported by the checks.
+ */
+int
+vhd_util_check(int argc, char **argv)
+{
+       char *name;
+       int c, err, parents;
+       struct vhd_util_check_ctx ctx;
+
+       if (!argc || !argv) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       name    = NULL;
+       parents = 0;
+       memset(&ctx, 0, sizeof(ctx));
+       vhd_util_check_stats_init(&ctx);
+
+       /* reset getopt state: this function may run after other parsers */
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:iItpbBsh")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'i':
+                       ctx.opts.ignore_footer = 1;
+                       break;
+               case 'I':
+                       ctx.opts.ignore_parent_uuid = 1;
+                       break;
+               case 't':
+                       ctx.opts.ignore_timestamps = 1;
+                       break;
+               case 'p':
+                       parents = 1;
+                       break;
+               case 'b':
+                       ctx.opts.check_data = 1;
+                       break;
+               case 'B':
+                       ctx.opts.no_check_bat = 1;
+                       break;
+               case 's':
+                       ctx.opts.collect_stats = 1;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       /* -s and -b both need the BAT check that -B switches off */
+       if ((ctx.opts.collect_stats || ctx.opts.check_data) &&
+                       ctx.opts.no_check_bat) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       err = vhd_util_check_vhd(&ctx, name);
+       if (err)
+               goto out;
+
+       if (parents)
+               err = vhd_util_check_parents(&ctx, name);
+
+       if (ctx.opts.collect_stats)
+               vhd_util_check_stats_print(&ctx);
+
+out:
+       /*
+        * Release the statistics state on the failure path as well, so a
+        * failed check does not skip the counterpart of
+        * vhd_util_check_stats_init().
+        */
+       vhd_util_check_stats_free(&ctx);
+       return err;
+
+usage:
+       printf("options: -n <file> [-i ignore missing primary footers] "
+              "[-I ignore parent uuids] [-t ignore timestamps] "
+              "[-B do not check BAT for overlapping (precludes -s, -b)] "
+              "[-p check parents] [-b check bitmaps] [-s stats] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-coalesce.c 
b/tools/blktap3/vhd/lib/vhd-util-coalesce.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-coalesce.c
@@ -0,0 +1,708 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+
+#include "libvhd.h"
+
+/*
+ * Write @secs sectors from @buf to the raw file @fd, starting at sector
+ * @sec.
+ *
+ * Returns 0 when the whole request was written.  On a failed seek or write
+ * the negative errno is returned; a short write that left errno untouched
+ * is reported as -EIO.
+ */
+static int
+__raw_io_write(int fd, char* buf, uint64_t sec, uint32_t secs)
+{
+       off64_t off;
+       ssize_t ret;    /* write() returns ssize_t; -1 signals failure */
+       int err;
+
+       errno = 0;
+       off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
+       if (off == (off64_t)-1) {
+               /* latch errno before printf() gets a chance to change it */
+               err = -errno;
+               printf("raw parent: seek(0x%08"PRIx64") failed: %d\n",
+                      vhd_sectors_to_bytes(sec), err);
+               return err;
+       }
+
+       ret = write(fd, buf, vhd_sectors_to_bytes(secs));
+       if (ret >= 0 && (uint64_t)ret == vhd_sectors_to_bytes(secs))
+               return 0;
+
+       /* errno was zeroed above, so 0 here means a plain short write */
+       err = errno;
+       printf("raw parent: write of 0x%"PRIx64" returned %zd, errno: %d\n",
+              vhd_sectors_to_bytes(secs), ret, -err);
+       return (err ? -err : -EIO);
+}
+
+/*
+ * Copy the data that @vhd holds for BAT entry @block into its parent.
+ *
+ * Use 'parent' if the parent is VHD, and 'parent_fd' if the parent is raw
+ * (the choice is made by testing parent->file).
+ *
+ * An unused BAT entry is skipped.  Otherwise the whole block is read from
+ * @vhd.  If the batmap test succeeds for @block the entire block is written
+ * to the parent; if not, the block's sector bitmap is read and only the
+ * runs of sectors whose bit is set are written.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_util_coalesce_block(vhd_context_t *vhd, vhd_context_t *parent,
+                       int parent_fd, uint64_t block)
+{
+       int i, err;
+       void *buf;
+       char *map;
+       uint64_t sec, secs;
+
+       buf = NULL;
+       map = NULL;
+       /* first sector covered by this block */
+       sec = block * vhd->spb;
+
+       if (vhd->bat.bat[block] == DD_BLK_UNUSED)
+               return 0;
+
+       /* posix_memalign() returns a positive error number, hence -err */
+       err = posix_memalign(&buf, 4096, vhd->header.block_size);
+       if (err)
+               return -err;
+
+       err = vhd_io_read(vhd, buf, sec, vhd->spb);
+       if (err)
+               goto done;
+
+       /*
+        * Fast path: write the block in one go, without consulting the
+        * sector bitmap (presumably the batmap bit marks a fully
+        * populated block -- TODO confirm).
+        */
+       if (vhd_has_batmap(vhd) && vhd_batmap_test(vhd, &vhd->batmap, block)) {
+               if (parent->file)
+                       err = vhd_io_write(parent, buf, sec, vhd->spb);
+               else
+                       err = __raw_io_write(parent_fd, buf, sec, vhd->spb);
+               goto done;
+       }
+
+       err = vhd_read_bitmap(vhd, block, &map);
+       if (err)
+               goto done;
+
+       for (i = 0; i < vhd->spb; i++) {
+               if (!vhd_bitmap_test(vhd, map, i))
+                       continue;
+
+               /* measure the run of consecutive set sectors starting at i */
+               for (secs = 0; i + secs < vhd->spb; secs++)
+                       if (!vhd_bitmap_test(vhd, map, i + secs))
+                               break;
+
+               if (parent->file)
+                       err = vhd_io_write(parent,
+                                          buf + vhd_sectors_to_bytes(i),
+                                          sec + i, secs);
+               else
+                       err = __raw_io_write(parent_fd,
+                                            buf + vhd_sectors_to_bytes(i),
+                                            sec + i, secs);
+               if (err)
+                       goto done;
+
+               /*
+                * Jump past the run.  The loop's own i++ then also skips
+                * sector i + secs, which is either past the block or the
+                * clear sector that ended the run.
+                */
+               i += secs;
+       }
+
+       err = 0;
+
+done:
+       free(buf);
+       free(map);
+       return err;
+}
+
+/*
+ * Coalesce every BAT entry of @from onto its destination, which is the VHD
+ * context @to or the raw descriptor @to_fd (see vhd_util_coalesce_block).
+ *
+ * The BAT of @from, and its batmap if it has one, are loaded first.  When
+ * @progress is non-zero a percentage is printed while the blocks are
+ * processed.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_util_coalesce_onto(vhd_context_t *from,
+                      vhd_context_t *to, int to_fd, int progress)
+{
+       uint64_t blk;
+       int err;
+
+       err = vhd_get_bat(from);
+       if (err)
+               return err;
+
+       if (vhd_has_batmap(from)) {
+               err = vhd_get_batmap(from);
+               if (err)
+                       return err;
+       }
+
+       for (blk = 0; blk < from->bat.entries; blk++) {
+               if (progress) {
+                       printf("\r%6.2f%%",
+                              ((float)blk / (float)from->bat.entries) * 100.00);
+                       fflush(stdout);
+               }
+
+               err = vhd_util_coalesce_block(from, to, to_fd, blk);
+               if (err)
+                       return err;
+       }
+
+       if (progress)
+               printf("\r100.00%%\n");
+
+       return 0;
+}
+
+/*
+ * Coalesce the VHD @name onto its immediate parent.
+ *
+ * @name:     child image, opened read-only
+ * @sparse:   non-zero to open a VHD parent with VHD_OPEN_IO_WRITE_SPARSE
+ * @progress: non-zero to print a progress percentage
+ *
+ * The parent path comes from the child's parent locator.  A raw parent is
+ * opened with open(2) and written through its descriptor; a VHD parent is
+ * opened read-write with vhd_open().
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int
+vhd_util_coalesce_parent(const char *name, int sparse, int progress)
+{
+       char *pname;
+       int err, parent_fd;
+       vhd_context_t vhd, parent;
+
+       parent_fd   = -1;
+       /* parent.file == NULL is how the raw-parent case is recognized */
+       parent.file = NULL;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       err = vhd_parent_locator_get(&vhd, &pname);
+       if (err) {
+               printf("error finding %s parent: %d\n", name, err);
+               vhd_close(&vhd);
+               return err;
+       }
+
+       if (vhd_parent_raw(&vhd)) {
+               parent_fd = open(pname, O_RDWR | O_DIRECT | O_LARGEFILE, 0644);
+               if (parent_fd == -1) {
+                       err = -errno;
+                       printf("failed to open parent %s: %d\n", pname, err);
+                       /* pname is released at out: (it used to leak here) */
+                       goto out;
+               }
+       } else {
+               int flags = (sparse ? VHD_OPEN_IO_WRITE_SPARSE : 0);
+               if (sparse) printf("opening for sparse writes\n");
+               err = vhd_open(&parent, pname, VHD_OPEN_RDWR | flags);
+               if (err) {
+                       printf("error opening %s: %d\n", pname, err);
+                       goto out;
+               }
+       }
+
+       err = vhd_util_coalesce_onto(&vhd, &parent, parent_fd, progress);
+
+       if (parent.file)
+               vhd_close(&parent);
+       else
+               close(parent_fd);
+
+out:
+       /* reached only after the parent path was obtained successfully */
+       free(pname);
+       vhd_close(&vhd);
+       return err;
+}
+
+/* Head type of the list of images built by vhd_util_coalesce_load_chain(). */
+TAILQ_HEAD(tqh_vhd_list_entry, vhd_list_entry);
+/*
+ * One image of a coalesce chain.  An entry is either a raw image, accessed
+ * through raw_fd, or a VHD, accessed through vhd.
+ */
+struct vhd_list_entry {
+       int                raw;     /* non-zero: raw image, use raw_fd */
+       int                raw_fd;  /* descriptor of the raw image */
+       vhd_context_t      vhd;     /* VHD context, used when !raw */
+       TAILQ_ENTRY(vhd_list_entry) next;  /* chain linkage */
+};
+
+/*
+ * Compare two paths after resolving both with realpath(3).
+ *
+ * On success 0 is returned and the strcmp() result of the two resolved
+ * paths is stored in @cmp.  If either path cannot be resolved, -errno is
+ * returned and @cmp is left untouched.
+ */
+static int
+vhd_util_pathcmp(const char *a, const char *b, int *cmp)
+{
+       char resolved_a[PATH_MAX];
+       char resolved_b[PATH_MAX];
+
+       if (!realpath(a, resolved_a))
+               return -errno;
+
+       if (!realpath(b, resolved_b))
+               return -errno;
+
+       *cmp = strcmp(resolved_a, resolved_b);
+       return 0;
+}
+
+/*
+ * Tear down a chain: close every image (descriptor for raw entries, VHD
+ * context otherwise), unlink and free each entry, and leave @head as an
+ * initialized empty list.
+ */
+static void
+vhd_util_coalesce_free_chain(struct tqh_vhd_list_entry *head)
+{
+       struct vhd_list_entry *cur;
+
+       while (!TAILQ_EMPTY(head)) {
+               cur = TAILQ_FIRST(head);
+
+               if (cur->raw)
+                       close(cur->raw_fd);
+               else
+                       vhd_close(&cur->vhd);
+
+               TAILQ_REMOVE(head, cur, next);
+               free(cur);
+       }
+
+       TAILQ_INIT(head);
+}
+
+/*
+ * Open every image from the child @cname up to the ancestor @aname and
+ * link them into @head, child first and ancestor last.  That is the order
+ * vhd_util_coalesce_ancestor() relies on: it takes the first entry as the
+ * child and TAILQ_LAST() as the ancestor.
+ *
+ * @head:   list head; initialized here
+ * @cname:  path of the child VHD
+ * @aname:  path of the ancestor to stop at (compared after realpath())
+ * @sparse: non-zero to open VHDs with VHD_OPEN_IO_WRITE_SPARSE
+ *
+ * VHD entries have their BAT (and batmap, if any) loaded.  A raw parent is
+ * opened with open(2) and recorded with entry->raw set.
+ *
+ * Returns 0 on success.  Errors: -EINVAL if @cname and @aname are the same
+ * file or an image has a different block size than the child, -ENOENT if
+ * the chain ends before @aname is reached, -ENOMEM on allocation failure,
+ * or the error of the failing call.  On error the chain is released and
+ * @head is left empty.
+ */
+static int
+vhd_util_coalesce_load_chain(struct tqh_vhd_list_entry *head,
+                            const char *cname, const char *aname, int sparse)
+{
+       char *next;
+       vhd_context_t *child;
+       int err, cmp, vhd_flags;
+       int linked;             /* is 'entry' already part of the list? */
+       struct vhd_list_entry *entry;
+
+       next   = NULL;
+       entry  = NULL;
+       linked = 0;
+       TAILQ_INIT(head);
+
+       vhd_flags = VHD_OPEN_RDWR | (sparse ? VHD_OPEN_IO_WRITE_SPARSE : 0);
+
+       err = vhd_util_pathcmp(cname, aname, &cmp);
+       if (err)
+               goto out;
+
+       if (!cmp) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       entry = calloc(1, sizeof(*entry));
+       if (!entry) {
+               /* err is still 0 here; report the failure explicitly */
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = vhd_open(&entry->vhd, cname, vhd_flags);
+       if (err)
+               goto out;
+
+       err = vhd_get_bat(&entry->vhd);
+       if (err)
+               goto out;
+
+       if (vhd_has_batmap(&entry->vhd)) {
+               err = vhd_get_batmap(&entry->vhd);
+               if (err)
+                       goto out;
+       }
+
+       child = &entry->vhd;
+       TAILQ_INSERT_TAIL(head, entry, next);
+       linked = 1;
+
+       for (;;) {
+               int raw;
+
+               /* a raw or non-differencing image has no further parent */
+               if (entry->raw || entry->vhd.footer.type != HD_TYPE_DIFF) {
+                       err = -ENOENT;
+                       goto out;
+               }
+
+               if (child->header.block_size != entry->vhd.header.block_size) {
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               err = vhd_parent_locator_get(&entry->vhd, &next);
+               if (err)
+                       goto out;
+
+               raw = vhd_parent_raw(&entry->vhd);
+
+               entry  = calloc(1, sizeof(*entry));
+               linked = 0;
+               if (!entry) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               if (raw) {
+                       entry->raw = raw;
+                       entry->raw_fd = open(next,
+                                            O_RDWR | O_DIRECT | O_LARGEFILE);
+                       if (entry->raw_fd == -1) {
+                               err = -errno;
+                               goto out;
+                       }
+               } else {
+                       err = vhd_open(&entry->vhd, next, vhd_flags);
+                       if (err)
+                               goto out;
+
+                       err = vhd_get_bat(&entry->vhd);
+                       if (err)
+                               goto out;
+
+                       if (vhd_has_batmap(&entry->vhd)) {
+                               err = vhd_get_batmap(&entry->vhd);
+                               if (err)
+                                       goto out;
+                       }
+               }
+
+               /*
+                * Append: parents go behind their children so that the
+                * list runs from the child (first) to the ancestor (last).
+                */
+               TAILQ_INSERT_TAIL(head, entry, next);
+               linked = 1;
+
+               err = vhd_util_pathcmp(next, aname, &cmp);
+               if (err)
+                       goto out;
+
+               if (!cmp)
+                       goto done;
+
+               free(next);
+               next = NULL;
+       }
+
+done:
+       err = 0;
+out:
+       /* the last parent path is no longer needed on any exit path */
+       free(next);
+       if (err) {
+               /* an entry that never made it into the list is ours to free */
+               if (entry && !linked) {
+                       if (entry->vhd.file)
+                               vhd_close(&entry->vhd);
+                       else if (entry->raw && entry->raw_fd != -1)
+                               close(entry->raw_fd);
+                       free(entry);
+               }
+               vhd_util_coalesce_free_chain(head);
+       }
+       return err;
+}
+
+/*
+ * For @block, clear in @ancestor's sector bitmap every bit that is set in
+ * the child's bitmap @cmap.
+ *
+ * @child:    image the bitmap @cmap belongs to
+ * @cmap:     sector bitmap of @block in @child
+ * @ancestor: image whose bitmap is updated
+ * @block:    BAT index of the block
+ *
+ * Nothing is done if @block lies beyond the ancestor's BAT or is unused
+ * there.  The ancestor's bitmap is written back only if at least one bit
+ * changed; in that case the block's batmap bit, if set, is cleared and the
+ * batmap written back as well.
+ *
+ * Returns 0 on success, -EINVAL if the two images differ in sectors per
+ * block, or the error of the failing read/write.
+ */
+static int
+vhd_util_coalesce_clear_bitmap(vhd_context_t *child, char *cmap,
+                              vhd_context_t *ancestor, const uint64_t block)
+{
+       char *amap = NULL;
+       int i, err;
+       int dirty = 0;  /* was read uninitialized when no bit got cleared */
+
+       if (child->spb != ancestor->spb) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       if (block >= ancestor->bat.entries)
+               goto done;
+
+       if (ancestor->bat.bat[block] == DD_BLK_UNUSED)
+               goto done;
+
+       err = vhd_read_bitmap(ancestor, block, &amap);
+       if (err)
+               goto out;
+
+       for (i = 0; i < child->spb; i++) {
+               if (vhd_bitmap_test(child, cmap, i)) {
+                       if (vhd_bitmap_test(ancestor, amap, i)) {
+                               dirty = 1;
+                               vhd_bitmap_clear(ancestor, amap, i);
+                       }
+               }
+       }
+
+       if (dirty) {
+               err = vhd_write_bitmap(ancestor, block, amap);
+               if (err)
+                       goto out;
+               /* a block that lost sectors must not stay set in the batmap */
+               if (vhd_has_batmap(ancestor) &&
+                   vhd_batmap_test(ancestor, &ancestor->batmap, block)) {
+                       vhd_batmap_clear(ancestor, &ancestor->batmap, block);
+                       err = vhd_write_batmap(ancestor, &ancestor->batmap);
+                       if (err)
+                               goto out;
+               }
+       }
+
+done:
+       err = 0;
+out:
+       free(amap);
+       return err;
+}
+
+/*
+ * Apply vhd_util_coalesce_clear_bitmap() for @block to the entries of
+ * @chain: the entry holding @child is skipped and the walk stops at the
+ * entry holding @ancestor.
+ *
+ * Nothing is done if @block is unused in @child.  Returns 0 on success or
+ * the first error encountered.
+ */
+static int
+vhd_util_coalesce_clear_bitmaps(struct tqh_vhd_list_entry *chain,
+        vhd_context_t *child, vhd_context_t *ancestor, uint64_t block)
+{
+       struct vhd_list_entry *link;
+       char *bitmap = NULL;
+       int err;
+
+       if (child->bat.bat[block] == DD_BLK_UNUSED)
+               return 0;
+
+       err = vhd_read_bitmap(child, block, &bitmap);
+       if (!err) {
+               TAILQ_FOREACH(link, chain, next) {
+                       vhd_context_t *cur = &link->vhd;
+
+                       if (cur == child)
+                               continue;
+                       if (cur == ancestor)
+                               break;
+
+                       err = vhd_util_coalesce_clear_bitmap(child, bitmap,
+                                                            cur, block);
+                       if (err)
+                               break;
+               }
+       }
+
+       free(bitmap);
+       return err;
+}
+
+/*
+ * Coalesce the VHD @cname onto the ancestor @aname.
+ *
+ * The chain is loaded with vhd_util_coalesce_load_chain(); its first entry
+ * is taken as the child and its last entry (if distinct from the first) as
+ * the ancestor.  After the child's data has been coalesced onto the
+ * ancestor, vhd_util_coalesce_clear_bitmaps() is run for every BAT entry
+ * of the child.
+ *
+ * Returns 0 on success, -EINVAL if no ancestor entry was found, or the
+ * first error encountered.
+ */
+static int
+vhd_util_coalesce_ancestor(const char *cname,
+                          const char *aname, int sparse, int progress)
+{
+       struct tqh_vhd_list_entry chain;
+       struct vhd_list_entry *first, *last;
+       vhd_context_t *child = NULL, *ancestor = NULL;
+       uint64_t blk;
+       int err, raw_fd;
+
+       err = vhd_util_coalesce_load_chain(&chain, cname, aname, sparse);
+       if (err)
+               goto out;
+
+       first = TAILQ_FIRST(&chain);
+       if (first) {
+               child = &first->vhd;
+
+               last = TAILQ_LAST(&chain, tqh_vhd_list_entry);
+               if (last != first) {
+                       ancestor = &last->vhd;
+                       raw_fd   = last->raw_fd;
+               }
+       }
+
+       if (!ancestor) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = vhd_util_coalesce_onto(child, ancestor, raw_fd, progress);
+
+       for (blk = 0; !err && blk < child->bat.entries; blk++)
+               err = vhd_util_coalesce_clear_bitmaps(&chain,
+                                                     child, ancestor, blk);
+
+out:
+       vhd_util_coalesce_free_chain(&chain);
+       return err;
+}
+
+/*
+ * Create the output image @name for a coalesce and open it into @dst.
+ *
+ * @dst:   context to open the new image into
+ * @src:   source image; provides the size and the expected block size
+ * @name:  path of the image to create; must not exist yet
+ * @flags: extra vhd_open() flags, OR-ed with VHD_OPEN_RDWR
+ *
+ * The new image is a dynamic VHD of src->footer.curr_size bytes.  If it
+ * cannot be opened, or its block size differs from the source's, it is
+ * removed again.
+ *
+ * Returns 0 on success, -EEXIST if @name already exists, -EINVAL on a
+ * block size mismatch, or a negative error code from the failing call.
+ */
+static int
+vhd_util_coalesce_open_output(vhd_context_t *dst,
+                             vhd_context_t *src, const char *name, int flags)
+{
+       int err;
+
+       err = access(name, F_OK);
+       if (!err) {
+               printf("%s already exists\n", name);
+               return -EEXIST;
+       } else if (errno != ENOENT) {
+               /* latch errno before printf() gets a chance to change it */
+               err = -errno;
+               printf("error checking %s: %d\n", name, -err);
+               return err;
+       }
+
+       err = vhd_create(name, src->footer.curr_size, HD_TYPE_DYNAMIC, 0, 0);
+       if (err) {
+               printf("error creating %s: %d\n", name, err);
+               return err;
+       }
+
+       err = vhd_open(dst, name, VHD_OPEN_RDWR | flags);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               unlink(name);
+               return err;
+       }
+
+       if (dst->header.block_size != src->header.block_size) {
+               /*
+                * Negative like every other error returned in this file,
+                * and the freshly opened context must not be left open.
+                */
+               err = -EINVAL;
+               printf("error opening %s: %d\n", name, err);
+               vhd_close(dst);
+               unlink(name);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Read @block through @src and write it to @dst, unless every byte of the
+ * block is zero, in which case nothing is written.
+ *
+ * Returns 0 on success (including the all-zero case) or a negative error
+ * code.
+ */
+static int
+vhd_util_coalesce_block_out(vhd_context_t *dst,
+                           vhd_context_t *src, uint64_t block)
+{
+       const char *cursor, *end;
+       uint64_t first_sec;
+       void *data = NULL;
+       int err;
+
+       first_sec = block * src->spb;
+
+       /* posix_memalign() returns a positive error number */
+       err = posix_memalign(&data, 4096, src->header.block_size);
+       if (err)
+               return -err;
+
+       err = vhd_io_read(src, data, first_sec, src->spb);
+       if (!err) {
+               cursor = data;
+               end    = cursor + src->header.block_size;
+
+               /* skip leading zero bytes; stop at the first non-zero one */
+               while (cursor < end && !*cursor)
+                       cursor++;
+
+               if (cursor < end)
+                       err = vhd_io_write(dst, data, first_sec, src->spb);
+       }
+
+       free(data);
+       return err;
+}
+
+/*
+ * Coalesce the image @src_name into the newly created image @dst_name.
+ *
+ * The source is opened read-only with VHD_OPEN_CACHED; the output is
+ * created by vhd_util_coalesce_open_output().  Every BAT entry of the
+ * source is then handed to vhd_util_coalesce_block_out().  If anything
+ * fails after the output was opened, the output file is unlinked.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_util_coalesce_out(const char *src_name, const char *dst_name,
+                     int sparse, int progress)
+{
+       vhd_context_t src, dst;
+       uint64_t blk;
+       int err, flags;
+
+       err = vhd_open(&src, src_name, VHD_OPEN_RDONLY | VHD_OPEN_CACHED);
+       if (err)
+               return err;
+
+       flags = (sparse ? VHD_OPEN_IO_WRITE_SPARSE : 0);
+       err = vhd_util_coalesce_open_output(&dst, &src, dst_name, flags);
+       if (err) {
+               vhd_close(&src);
+               return err;
+       }
+
+       err = vhd_get_bat(&src);
+       if (!err && vhd_has_batmap(&src))
+               err = vhd_get_batmap(&src);
+
+       for (blk = 0; !err && blk < src.bat.entries; blk++) {
+               if (progress) {
+                       printf("\r%6.2f%%",
+                              ((float)blk / (float)src.bat.entries) * 100.0);
+                       fflush(stdout);
+               }
+               err = vhd_util_coalesce_block_out(&dst, &src, blk);
+       }
+
+       if (!err && progress)
+               printf("\r100.00%%\n");
+
+       /* do not leave a partial output behind */
+       if (err)
+               unlink(dst.file);
+
+       vhd_close(&src);
+       vhd_close(&dst);
+       return err;
+}
+
+/*
+ * Entry point of the "coalesce" command.
+ *
+ * Options (parsed with getopt):
+ *   -n <name>      image to coalesce (required)
+ *   -o <output>    coalesce into a new output image
+ *   -a <ancestor>  coalesce onto the given ancestor
+ *   -s             sparse
+ *   -p             print progress
+ *   -h             print usage
+ *
+ * -o and -a are mutually exclusive; with neither, the image is coalesced
+ * onto its parent.  Returns the result of the selected operation, or
+ * -EINVAL after printing the usage text.
+ */
+int
+vhd_util_coalesce(int argc, char **argv)
+{
+       char *name = NULL, *oname = NULL, *ancestor = NULL;
+       int sparse = 0, progress = 0;
+       int err, c;
+
+       if (!argc || !argv)
+               goto usage;
+
+       /* reset getopt state: this function may run after other parsers */
+       optind = 0;
+       for (;;) {
+               c = getopt(argc, argv, "n:o:a:sph");
+               if (c == -1)
+                       break;
+
+               if (c == 'n')
+                       name = optarg;
+               else if (c == 'o')
+                       oname = optarg;
+               else if (c == 'a')
+                       ancestor = optarg;
+               else if (c == 's')
+                       sparse = 1;
+               else if (c == 'p')
+                       progress = 1;
+               else
+                       goto usage;     /* -h and anything unknown */
+       }
+
+       if (!name || optind != argc || (oname && ancestor))
+               goto usage;
+
+       if (oname) {
+               err = vhd_util_coalesce_out(name, oname, sparse, progress);
+       } else if (ancestor) {
+               err = vhd_util_coalesce_ancestor(name, ancestor,
+                                                sparse, progress);
+       } else {
+               err = vhd_util_coalesce_parent(name, sparse, progress);
+       }
+
+       if (err)
+               printf("error coalescing: %d\n", err);
+
+       return err;
+
+usage:
+       printf("options: <-n name> [-a ancestor] "
+              "[-o output] [-s sparse] [-p progress] [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-create.c 
b/tools/blktap3/vhd/lib/vhd-util-create.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-create.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "create" command.
+ *
+ * Options (parsed with getopt):
+ *   -n <name>  image to create (required)
+ *   -s <size>  size in MB
+ *   -S <size>  size in MB for metadata preallocation
+ *   -r         reserve: create a fixed instead of a dynamic image
+ *   -h         print usage
+ *
+ * At least one of -s / -S has to be given.  Both sizes are parsed with
+ * strtoull() and shifted left by 20 before being passed to vhd_create().
+ *
+ * Returns the result of vhd_create(), or -EINVAL for invalid arguments.
+ */
+int
+vhd_util_create(int argc, char **argv)
+{
+       vhd_flag_creat_t flags = 0;
+       uint64_t size = 0, msize = 0;
+       char *name = NULL;
+       int sparse = 1;
+       int have_size = 0;      /* set once -s or -S has been seen */
+       int c;
+
+       if (!argc || !argv)
+               goto usage;
+
+       /* reset getopt state: this function may run after other parsers */
+       optind = 0;
+       for (;;) {
+               c = getopt(argc, argv, "n:s:S:rh");
+               if (c == -1)
+                       break;
+
+               if (c == 'n') {
+                       name = optarg;
+               } else if (c == 's') {
+                       have_size = 1;
+                       size = strtoull(optarg, NULL, 10);
+               } else if (c == 'S') {
+                       have_size = 1;
+                       msize = strtoull(optarg, NULL, 10);
+               } else if (c == 'r') {
+                       sparse = 0;
+               } else {
+                       goto usage;     /* -h and anything unknown */
+               }
+       }
+
+       if (!have_size || !name || optind != argc)
+               goto usage;
+
+       if (msize && msize < size) {
+               printf("Error: <-S size> must be greater than <-s size>\n");
+               return -EINVAL;
+       }
+
+       return vhd_create(name, size << 20,
+                         (sparse ? HD_TYPE_DYNAMIC : HD_TYPE_FIXED),
+                         msize << 20, flags);
+
+usage:
+       printf("options: <-n name> <-s size (MB)> [-r reserve] [-h help] "
+                       "[<-S size (MB) for metadata preallocation "
+                       "(see vhd-util resize)>]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-fill.c 
b/tools/blktap3/vhd/lib/vhd-util-fill.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-fill.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd-util "fill" command.
+ *
+ * Opens the VHD named by -n read/write, loads its BAT and then, for each
+ * of the header's max_bat_size blocks, reads one block worth of sectors
+ * through vhd_io_read() and writes the same data back through
+ * vhd_io_write().
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the (negative) error of
+ * the first step that failed.
+ */
+int
+vhd_util_fill(int argc, char **argv)
+{
+       vhd_context_t vhd;
+       uint64_t blk, sector, blk_secs;
+       char *name = NULL;
+       void *buf = NULL;
+       int opt, err;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       for (;;) {
+               opt = getopt(argc, argv, "n:h");
+               if (opt == -1)
+                       break;
+
+               /* -n is the only accepted option; -h and anything else
+                * end up printing the usage text */
+               if (opt != 'n')
+                       goto usage;
+
+               name = optarg;
+       }
+
+       if (optind != argc || !name)
+               goto usage;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       err = vhd_get_bat(&vhd);
+       if (err)
+               goto done;
+
+       /* posix_memalign() reports failure as a positive errno value */
+       err = posix_memalign(&buf, 4096, vhd.header.block_size);
+       if (err) {
+               err = -err;
+               goto done;
+       }
+
+       blk_secs = vhd.header.block_size >> VHD_SECTOR_SHIFT;
+
+       for (blk = 0, sector = 0;
+            blk < vhd.header.max_bat_size;
+            blk++, sector += blk_secs) {
+               err = vhd_io_read(&vhd, buf, sector, blk_secs);
+               if (!err)
+                       err = vhd_io_write(&vhd, buf, sector, blk_secs);
+               if (err)
+                       break;
+       }
+
+ done:
+       free(buf);
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("options: <-n name> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-modify.c b/tools/blktap3/vhd/lib/vhd-util-modify.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-modify.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * Mark every BAT entry of @vhd as unused and, when the image carries a
+ * batmap, clear the batmap as well.
+ *
+ * The batmap is only touched when vhd_has_batmap() reports one: it is
+ * only loaded in that case, so clearing and writing it unconditionally
+ * would operate on a batmap that was never read.
+ *
+ * Returns 0 on success or the error of the first libvhd call that failed.
+ */
+static int
+vhd_util_zero_bat(vhd_context_t *vhd)
+{
+       int err, map_bytes, has_batmap;
+       uint64_t i;
+
+       err = vhd_get_bat(vhd);
+       if (err)
+               return err;
+
+       has_batmap = vhd_has_batmap(vhd);
+       if (has_batmap) {
+               err = vhd_get_batmap(vhd);
+               if (err)
+                       return err;
+       }
+
+       for (i = 0; i < vhd->bat.entries; i++)
+               vhd->bat.bat[i] = DD_BLK_UNUSED;
+       err = vhd_write_bat(vhd, &vhd->bat);
+       if (err)
+               return err;
+
+       if (!has_batmap)
+               return 0;
+
+       /* one bit per block, rounded up to whole sectors */
+       map_bytes = ((vhd->footer.curr_size >> VHD_SECTOR_SHIFT) /
+                       vhd->spb) >> 3;
+       map_bytes = vhd_sectors_to_bytes(secs_round_up_no_zero(map_bytes));
+       memset(vhd->batmap.map, 0, map_bytes);
+       return vhd_write_batmap(vhd, &vhd->batmap);
+}
+
+/*
+ * vhd-util "modify" command.
+ *
+ * Options:
+ *   -n name        VHD to modify (required)
+ *   -s NEW_SIZE    set the physical size via vhd_set_phys_size()
+ *   -p NEW_PARENT  change the parent via vhd_change_parent()
+ *   -m             passed as the "raw" flag of vhd_change_parent()
+ *   -z             zero the BAT (dynamic VHDs only, -ENOSYS otherwise)
+ *   -h             print usage
+ *
+ * The requested operations run in the order zero, resize, reparent.  The
+ * first one that fails stops processing and its error is returned, so a
+ * later successful step can no longer hide an earlier failure.
+ *
+ * Returns 0 on success, -EINVAL on bad usage, or the error of the step
+ * that failed.
+ */
+int
+vhd_util_modify(int argc, char **argv)
+{
+       char *name;
+       vhd_context_t vhd;
+       int err, c, size, parent, parent_raw, kill_data;
+       off64_t newsize = 0;
+       char *newparent = NULL;
+       char *end;
+
+       name       = NULL;
+       size       = 0;
+       parent     = 0;
+       parent_raw = 0;
+       kill_data  = 0;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:s:p:mzh")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 's':
+                       size = 1;
+                       errno = 0;
+                       newsize = strtoll(optarg, &end, 10);
+                       /* reject overflow, empty input, trailing junk
+                        * and negative sizes */
+                       if (errno || end == optarg || *end != '\0' ||
+                           newsize < 0) {
+                               fprintf(stderr, "Invalid size '%s'\n", optarg);
+                               goto usage;
+                       }
+                       break;
+               case 'p':
+                       parent = 1;
+                       newparent = optarg;
+                       break;
+               case 'm':
+                       parent_raw = 1;
+                       break;
+               case 'z':
+                       kill_data = 1;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc)
+               goto usage;
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       if (kill_data) {
+               if (vhd_type_dynamic(&vhd))
+                       err = vhd_util_zero_bat(&vhd);
+               else
+                       err = -ENOSYS;
+
+               if (!err && !vhd.is_block) // truncate file-based VHDs
+                       err = vhd_write_footer(&vhd, &vhd.footer);
+
+               if (err) {
+                       printf("failed to zero VHD: %d\n", err);
+                       goto done;
+               }
+       }
+
+       if (size) {
+               err = vhd_set_phys_size(&vhd, newsize);
+               if (err) {
+                       printf("failed to set physical size to %"PRId64":"
+                              " %d\n", (int64_t)newsize, err);
+                       goto done;
+               }
+       }
+
+       if (parent) {
+               TEST_FAIL_AT(FAIL_REPARENT_BEGIN);
+               err = vhd_change_parent(&vhd, newparent, parent_raw);
+               if (err) {
+                       printf("failed to set parent to '%s': %d\n",
+                                       newparent, err);
+                       goto done;
+               }
+               TEST_FAIL_AT(FAIL_REPARENT_END);
+       }
+
+done:
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("*** Dangerous operations, use with care ***\n");
+       printf("options: <-n name> [-p NEW_PARENT set parent [-m raw]] "
+                       "[-s NEW_SIZE set size] [-z zero (kill data)] "
+                       "[-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-query.c b/tools/blktap3/vhd/lib/vhd-util-query.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-query.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd-util "query" command.
+ *
+ * Options:
+ *   -n name  VHD to query (required)
+ *   -v       print the virtual size in MB
+ *   -s       print the physical utilization in bytes
+ *   -p       print the parent name
+ *   -f       print the 'hidden' field
+ *   -m       print the 'marker' field
+ *   -d       print the chain depth
+ *   -S       print the max virtual size (MB) reachable by fast resize
+ *   -h       print usage (returns 0)
+ *
+ * All requested queries are attempted; the first error encountered is the
+ * one returned.  Returns 0 on success and -EINVAL on bad usage.
+ */
+int
+vhd_util_query(int argc, char **argv)
+{
+       char *name;
+       vhd_context_t vhd;
+       off64_t currsize;
+       int ret, err, c;
+       int size, physize, parent, fields, depth, fastresize, marker;
+
+       name       = NULL;
+       size       = 0;
+       physize    = 0;
+       parent     = 0;
+       fields     = 0;
+       depth      = 0;
+       fastresize = 0;
+       marker     = 0;
+
+       if (!argc || !argv) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:vspfdSmh")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'v':
+                       size = 1;
+                       break;
+               case 's':
+                       physize = 1;
+                       break;
+               case 'p':
+                       parent = 1;
+                       break;
+               case 'f':
+                       fields = 1;
+                       break;
+               case 'd':
+                       depth = 1;
+                       break;
+               case 'S':
+                       fastresize = 1;
+                       break;
+               case 'm':
+                       marker = 1;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (!name || optind != argc) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       if (size)
+               printf("%"PRIu64"\n", vhd.footer.curr_size >> 20);
+
+       if (physize) {
+               err = vhd_get_phys_size(&vhd, &currsize);
+               if (err)
+                       printf("failed to get physical size: %d\n", err);
+               else
+                       printf("%"PRIu64"\n", (uint64_t)currsize);
+       }
+
+       if (parent) {
+               ret = 0;
+
+               if (vhd.footer.type != HD_TYPE_DIFF)
+                       printf("%s has no parent\n", name);
+               else {
+                       char *pname;
+
+                       ret = vhd_parent_locator_get(&vhd, &pname);
+                       if (ret)
+                               printf("query failed\n");
+                       else {
+                               printf("%s\n", pname);
+                               free(pname);
+                       }
+               }
+
+               /* keep the first error seen */
+               if (!err)
+                       err = ret;
+       }
+
+       if (fields) {
+               int hidden;
+
+               ret = vhd_hidden(&vhd, &hidden);
+               if (ret)
+                       printf("error checking 'hidden' field: %d\n", ret);
+               else
+                       printf("hidden: %d\n", hidden);
+
+               if (!err)
+                       err = ret;
+       }
+
+       if (marker) {
+               /* named 'mark' so it does not shadow the option flag */
+               char mark;
+
+               ret = vhd_marker(&vhd, &mark);
+               if (ret)
+                       printf("error checking 'marker' field: %d\n", ret);
+               else
+                       printf("marker: %d\n", mark);
+
+               if (!err)
+                       err = ret;
+       }
+
+       if (depth) {
+               int length;
+
+               ret = vhd_chain_depth(&vhd, &length);
+               if (ret)
+                       printf("error checking chain depth: %d\n", ret);
+               else
+                       printf("chain depth: %d\n", length);
+
+               if (!err)
+                       err = ret;
+       }
+
+       if (fastresize) {
+               uint64_t max_size;
+
+               /* widen before shifting so the result cannot wrap at
+                * the width of max_bat_size */
+               max_size = (uint64_t)vhd.header.max_bat_size <<
+                       (VHD_BLOCK_SHIFT - 20);
+               printf("%"PRIu64"\n", max_size);
+       }
+
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("options: <-n name> [-v print virtual size (in MB)] "
+              "[-s print physical utilization (bytes)] [-p print parent] "
+              "[-f print fields] [-m print marker] [-d print chain depth] "
+              "[-S print max virtual size (MB) for fast resize] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-read.c b/tools/blktap3/vhd/lib/vhd-util-read.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-read.c
@@ -0,0 +1,937 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * Scratch buffer shared by __xconv() and __dconv().
+ *
+ * It has to hold the longest rendering of a uint64_t plus the terminating
+ * NUL: 20 decimal digits, or "0x" followed by 16 hex digits.  A smaller
+ * buffer makes snprintf() silently truncate large values.
+ */
+#define nsize     24
+static char nbuf[nsize];
+
+/*
+ * Format @num as hexadecimal with a "0x" prefix ("%#x" style) into the
+ * shared buffer and return it.  The result is overwritten by the next
+ * __xconv()/__dconv() call.
+ */
+static inline char *
+__xconv(uint64_t num)
+{
+       snprintf(nbuf, nsize, "%#" PRIx64 , num);
+       return nbuf;
+}
+
+/*
+ * Format @num as unsigned decimal into the shared buffer and return it.
+ * The result is overwritten by the next __xconv()/__dconv() call.
+ */
+static inline char *
+__dconv(uint64_t num)
+{
+       snprintf(nbuf, nsize, "%" PRIu64, num);
+       return nbuf;
+}
+
+/*
+ * Render @num as hex when @hex is non-zero, as decimal otherwise.
+ *
+ * Both branches return the same static buffer, so at most one conv()
+ * result can be used per printf() call.  The cast is not parenthesized
+ * around @num, so for an expression argument it binds to the first
+ * operand only.
+ */
+#define conv(hex, num) \
+       (hex ? __xconv((uint64_t)num) : __dconv((uint64_t)num))
+
+/*
+ * Print a human-readable summary of the dynamic-disk header @h to stdout.
+ *
+ * @vhd is only used to decode the parent name stored in the header.
+ * @hex selects hexadecimal (non-zero) or decimal output for the numeric
+ * fields that go through conv().
+ */
+static void
+vhd_print_header(vhd_context_t *vhd, vhd_header_t *h, int hex)
+{
+       int err;
+       uint32_t  cksm;
+       char      uuid[37], time_str[26], cookie[9], *name;
+
+       /* so that free() below is safe even if decoding fails without
+        * storing anything */
+       name = NULL;
+
+       printf("VHD Header Summary:\n-------------------\n");
+
+       /* bound the read: the cookie field is presumably not
+        * NUL-terminated, so a bare "%s" could run past it */
+       snprintf(cookie, sizeof(cookie), "%.8s", h->cookie);
+       printf("Cookie              : %s\n", cookie);
+
+       printf("Data offset (unusd) : %s\n", conv(hex, h->data_offset));
+       printf("Table offset        : %s\n", conv(hex, h->table_offset));
+       printf("Header version      : 0x%08x\n", h->hdr_ver);
+       printf("Max BAT size        : %s\n", conv(hex, h->max_bat_size));
+       printf("Block size          : %s ", conv(hex, h->block_size));
+       printf("(%s MB)\n", conv(hex, h->block_size >> 20));
+
+       err = vhd_header_decode_parent(vhd, h, &name);
+       printf("Parent name         : %s\n",
+              (err ? "failed to read name" : name));
+       free(name);
+
+       uuid_unparse(h->prt_uuid, uuid);
+       printf("Parent UUID         : %s\n", uuid);
+
+       vhd_time_to_string(h->prt_ts, time_str);
+       printf("Parent timestamp    : %s\n", time_str);
+
+       cksm = vhd_checksum_header(h);
+       printf("Checksum            : 0x%x|0x%x (%s)\n", h->checksum, cksm,
+               h->checksum == cksm ? "Good!" : "Bad!");
+       printf("\n");
+}
+
+/*
+ * String table for hd.type: maps a footer 'type' value (used directly as
+ * the index) to a printable name.  vhd_print_footer() only indexes it
+ * after checking that the type is <= HD_TYPE_MAX.
+ */
+char *hd_type_str[7] = {
+        "None",                    /* 0 */
+        "Reserved (deprecated)",   /* 1 */
+        "Fixed hard disk",         /* 2 */
+        "Dynamic hard disk",       /* 3 */
+        "Differencing hard disk",  /* 4 */
+        "Reserved (deprecated)",   /* 5 */
+        "Reserved (deprecated)"    /* 6 */
+};
+
+/*
+ * Print a human-readable summary of the VHD footer @f to stdout.
+ *
+ * @hex selects hexadecimal (non-zero) or decimal output for the numeric
+ * fields that go through conv().
+ */
+static void
+vhd_print_footer(vhd_footer_t *f, int hex)
+{
+       uint64_t  c, h, s;
+       uint32_t  ff_maj, ff_min, cr_maj, cr_min, cksm;
+       char      time_str[26], creator[5], uuid[37], cookie[9];
+
+       printf("VHD Footer Summary:\n-------------------\n");
+
+       /* bound the read: the cookie field is presumably not
+        * NUL-terminated, so a bare "%s" could run past it */
+       snprintf(cookie, sizeof(cookie), "%.8s", f->cookie);
+       printf("Cookie              : %s\n", cookie);
+
+       printf("Features            : (0x%08x) %s%s\n", f->features,
+               (f->features & HD_TEMPORARY) ? "<TEMP>" : "",
+               (f->features & HD_RESERVED)  ? "<RESV>" : "");
+
+       /* versions are packed as major in the high and minor in the low
+        * 16 bits */
+       ff_maj = f->ff_version >> 16;
+       ff_min = f->ff_version & 0xffff;
+       printf("File format version : Major: %u, Minor: %u\n",
+               ff_maj, ff_min);
+
+       printf("Data offset         : %s\n", conv(hex, f->data_offset));
+
+       vhd_time_to_string(f->timestamp, time_str);
+       printf("Timestamp           : %s\n", time_str);
+
+       memcpy(creator, f->crtr_app, 4);
+       creator[4] = '\0';
+       printf("Creator Application : '%s'\n", creator);
+
+       cr_maj = f->crtr_ver >> 16;
+       cr_min = f->crtr_ver & 0xffff;
+       printf("Creator version     : Major: %u, Minor: %u\n",
+               cr_maj, cr_min);
+
+       printf("Creator OS          : %s\n",
+               ((f->crtr_os == HD_CR_OS_WINDOWS) ? "Windows" :
+                ((f->crtr_os == HD_CR_OS_MACINTOSH) ? "Macintosh" :
+                 "Unknown!")));
+
+       printf("Original disk size  : %s MB ", conv(hex, f->orig_size >> 20));
+       printf("(%s Bytes)\n", conv(hex, f->orig_size));
+
+       printf("Current disk size   : %s MB ", conv(hex, f->curr_size >> 20));
+       printf("(%s Bytes)\n", conv(hex, f->curr_size));
+
+       /* geometry: cylinders in bits 16 and up, heads in bits 8-15,
+        * sectors per track in bits 0-7 */
+       c = f->geometry >> 16;
+       h = (f->geometry & 0x0000FF00) >> 8;
+       s = f->geometry & 0x000000FF;
+       printf("Geometry            : Cyl: %s, ", conv(hex, c));
+       printf("Hds: %s, ", conv(hex, h));
+       printf("Sctrs: %s\n", conv(hex, s));
+       printf("                    : = %s MB ", conv(hex, (c * h * s) >> 11));
+       printf("(%s Bytes)\n", conv(hex, c * h * s << 9));
+
+       /* the format string already ends the line, so the fallback text
+        * must not carry a newline of its own */
+       printf("Disk type           : %s\n",
+              f->type <= HD_TYPE_MAX ?
+              hd_type_str[f->type] : "Unknown type!");
+
+       cksm = vhd_checksum_footer(f);
+       printf("Checksum            : 0x%x|0x%x (%s)\n", f->checksum, cksm,
+               f->checksum == cksm ? "Good!" : "Bad!");
+
+       uuid_unparse(f->uuid, uuid);
+       printf("UUID                : %s\n", uuid);
+
+       printf("Saved state         : %s\n", f->saved == 0 ? "No" : "Yes");
+       printf("Hidden              : %d\n", f->hidden);
+       printf("\n");
+}
+
+/*
+ * Return the symbolic name of the parent-locator platform code @code, or
+ * "UNKNOWN" for a value that matches none of the PLAT_CODE_* constants.
+ * The returned string is a literal and must not be freed or modified.
+ */
+static inline char *
+code_name(uint32_t code)
+{
+       switch(code) {
+       case PLAT_CODE_NONE:
+               return "PLAT_CODE_NONE";
+       case PLAT_CODE_WI2R:
+               return "PLAT_CODE_WI2R";
+       case PLAT_CODE_WI2K:
+               return "PLAT_CODE_WI2K";
+       case PLAT_CODE_W2RU:
+               return "PLAT_CODE_W2RU";
+       case PLAT_CODE_W2KU:
+               return "PLAT_CODE_W2KU";
+       case PLAT_CODE_MAC:
+               return "PLAT_CODE_MAC";
+       case PLAT_CODE_MACX:
+               return "PLAT_CODE_MACX";
+       default:
+               return "UNKNOWN";
+       }
+}
+
+/*
+ * Read the parent name referenced by locator @loc of @vhd and print it.
+ *
+ * The buffer handed back by vhd_parent_locator_read() is released here
+ * once printed (the sibling query code frees the result of
+ * vhd_parent_locator_get() the same way).
+ */
+static void
+vhd_print_parent(vhd_context_t *vhd, vhd_parent_locator_t *loc)
+{
+       int err;
+       char *buf;
+
+       buf = NULL;
+
+       err = vhd_parent_locator_read(vhd, loc, &buf);
+       if (err) {
+               printf("failed to read parent name\n");
+               return;
+       }
+
+       printf("       decoded name : %s\n", buf);
+       free(buf);
+}
+
+/*
+ * Print every parent locator slot of @vhd's header whose platform code is
+ * not PLAT_CODE_NONE, followed by the parent name it decodes to.
+ *
+ * @hex selects hexadecimal (non-zero) or decimal output for the numeric
+ * fields.
+ */
+static void
+vhd_print_parent_locators(vhd_context_t *vhd, int hex)
+{
+       vhd_parent_locator_t *entry;
+       int idx, slots;
+
+       printf("VHD Parent Locators:\n--------------------\n");
+
+       slots = sizeof(vhd->header.loc) / sizeof(struct prt_loc);
+
+       for (idx = 0; idx < slots; idx++) {
+               entry = &vhd->header.loc[idx];
+
+               if (entry->code != PLAT_CODE_NONE) {
+                       printf("locator:            : %d\n", idx);
+                       printf("       code         : %s\n",
+                              code_name(entry->code));
+                       /* conv() reuses one buffer: one call per printf */
+                       printf("       data_space   : %s\n",
+                              conv(hex, entry->data_space));
+                       printf("       data_length  : %s\n",
+                              conv(hex, entry->data_len));
+                       printf("       data_offset  : %s\n",
+                              conv(hex, entry->data_offset));
+                       vhd_print_parent(vhd, entry);
+                       printf("\n");
+               }
+       }
+}
+
+/*
+ * Print the batmap header of @batmap (offset, size in sectors, version)
+ * and whether its stored checksum matches the one recomputed with
+ * vhd_checksum_batmap().
+ *
+ * @hex selects hexadecimal (non-zero) or decimal output for offset and
+ * size.
+ */
+static void
+vhd_print_batmap_header(vhd_context_t *vhd, vhd_batmap_t *batmap, int hex)
+{
+       uint32_t computed;
+       const char *verdict;
+
+       printf("VHD Batmap Summary:\n-------------------\n");
+       printf("Batmap offset       : %s\n",
+              conv(hex, batmap->header.batmap_offset));
+       printf("Batmap size (secs)  : %s\n",
+              conv(hex, batmap->header.batmap_size));
+       printf("Batmap version      : 0x%08x\n",
+              batmap->header.batmap_version);
+
+       computed = vhd_checksum_batmap(vhd, batmap);
+       if (batmap->header.checksum == computed)
+               verdict = "Good!";
+       else
+               verdict = "Bad!";
+
+       printf("Checksum            : 0x%x|0x%x (%s)\n",
+              batmap->header.checksum, computed, verdict);
+       printf("\n");
+}
+
+/*
+ * Check @block against the header's max_bat_size.
+ *
+ * Returns 0 when @block is <= max_bat_size; otherwise reports the block
+ * on stderr (formatted according to @hex) and returns -ERANGE.
+ */
+static inline int
+check_block_range(vhd_context_t *vhd, uint64_t block, int hex)
+{
+       if (block <= vhd->header.max_bat_size)
+               return 0;
+
+       fprintf(stderr, "block %s past end of file\n",
+               conv(hex, block));
+       return -ERANGE;
+}
+
+/*
+ * Print the footer of @vhd and, for dynamic images, the header, the
+ * parent locators (differencing disks only) and the batmap header (when
+ * the image has a batmap).
+ *
+ * Returns 0 on success or the error from vhd_get_batmap().
+ */
+static int
+vhd_print_headers(vhd_context_t *vhd, int hex)
+{
+       int rc;
+
+       vhd_print_footer(&vhd->footer, hex);
+
+       /* nothing beyond the footer is printed for non-dynamic images */
+       if (!vhd_type_dynamic(vhd))
+               return 0;
+
+       vhd_print_header(vhd, &vhd->header, hex);
+
+       if (vhd->footer.type == HD_TYPE_DIFF)
+               vhd_print_parent_locators(vhd, hex);
+
+       if (!vhd_has_batmap(vhd))
+               return 0;
+
+       rc = vhd_get_batmap(vhd);
+       if (rc) {
+               printf("failed to get batmap header\n");
+               return rc;
+       }
+
+       vhd_print_batmap_header(vhd, &vhd->batmap, hex);
+       return 0;
+}
+
+/*
+ * Fallback dump for a file that could not be handled as a valid VHD:
+ * open @name directly, read whatever footer and header can be read, and
+ * print both.
+ *
+ * The results of vhd_read_footer()/vhd_read_header() are not checked;
+ * this looks like a deliberate best effort, with the context zeroed
+ * beforehand so that unread fields print as zero.
+ *
+ * Returns 0 on success, -errno if the file cannot be opened, or -ENOMEM
+ * if the file name cannot be duplicated.
+ */
+static int
+vhd_dump_headers(const char *name, int hex)
+{
+       vhd_context_t vhd;
+
+       libvhd_set_log_level(1);
+       memset(&vhd, 0, sizeof(vhd));
+
+       printf("\n%s appears invalid; dumping headers\n\n", name);
+
+       vhd.fd = open(name, O_DIRECT | O_LARGEFILE | O_RDONLY);
+       if (vhd.fd == -1)
+               return -errno;
+
+       vhd.file = strdup(name);
+       if (!vhd.file) {
+               close(vhd.fd);
+               return -ENOMEM;
+       }
+
+       vhd_read_footer(&vhd, &vhd.footer);
+       vhd_read_header(&vhd, &vhd.header);
+
+       vhd_print_footer(&vhd.footer, hex);
+       vhd_print_header(&vhd, &vhd.header, hex);
+
+       close(vhd.fd);
+       free(vhd.file);
+
+       return 0;
+}
+
+/*
+ * For each of the @count logical sectors starting at @sector, print the
+ * block it belongs to, its offset within that block, and the byte offset
+ * in the file where its data lives (or "not allocated").
+ *
+ * A block's data follows the block's sector bitmap, which occupies
+ * vhd->bm_secs sectors; assuming exactly one bitmap sector is only right
+ * for the default block size.
+ *
+ * Reads vhd->bat.bat directly, so the BAT is presumably loaded by the
+ * caller.  Returns 0 on success or -ERANGE if the range ends past the
+ * current disk size.
+ */
+static int
+vhd_print_logical_to_physical(vhd_context_t *vhd,
+                             uint64_t sector, int count, int hex)
+{
+       int i;
+       uint32_t blk, lsec;
+       uint64_t cur, offset;
+
+       if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+               fprintf(stderr, "sector %s past end of file\n",
+                       conv(hex, sector + count));
+               return -ERANGE;
+       }
+
+       for (i = 0; i < count; i++) {
+               cur    = sector + i;
+               blk    = cur / vhd->spb;
+               lsec   = cur % vhd->spb;
+               offset = vhd->bat.bat[blk];
+
+               if (offset != DD_BLK_UNUSED) {
+                       /* skip the block's bitmap, then index the sector */
+                       offset += vhd->bm_secs + lsec;
+                       offset  = vhd_sectors_to_bytes(offset);
+               }
+
+               printf("logical sector %s: ", conv(hex, cur));
+               printf("block number: %s, ", conv(hex, blk));
+               printf("sector offset: %s, ", conv(hex, lsec));
+               printf("file offset: %s\n", (offset == DD_BLK_UNUSED ?
+                       "not allocated" : conv(hex, offset)));
+       }
+
+       return 0;
+}
+
+/*
+ * Print the BAT entries of the @count blocks starting at @block: the
+ * byte offset each entry converts to, or "not allocated" for unused
+ * entries.
+ *
+ * Returns 0 on success or -ERANGE if check_block_range() rejects
+ * @block + @count.
+ */
+static int
+vhd_print_bat(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       int idx;
+       uint64_t blk, entry;
+
+       if (check_block_range(vhd, block + count, hex) != 0)
+               return -ERANGE;
+
+       for (idx = 0; idx < count; idx++) {
+               blk   = block + idx;
+               entry = vhd->bat.bat[blk];
+
+               printf("block: %s: ", conv(hex, blk));
+
+               if (entry == DD_BLK_UNUSED)
+                       printf("offset: %s\n", "not allocated");
+               else
+                       printf("offset: %s\n",
+                              conv(hex, vhd_sectors_to_bytes(entry)));
+       }
+
+       return 0;
+}
+
+/*
+ * Write a raw bitmap to stdout with one bit per block of @vhd, set when
+ * the block's BAT entry is in use.  The number of blocks is the current
+ * disk size divided by the block size.
+ *
+ * Returns 0 on success, -EINVAL for non-dynamic images, -ENOMEM if the
+ * bitmap cannot be allocated, or -errno if the write fails.
+ */
+static int
+vhd_print_bat_str(vhd_context_t *vhd)
+{
+       int blk, nblocks, nbytes, rc;
+       char *map;
+
+       if (!vhd_type_dynamic(vhd))
+               return -EINVAL;
+
+       nblocks = vhd->footer.curr_size / vhd->header.block_size;
+
+       /* bytes needed for nblocks bits, rounded up */
+       nbytes = nblocks >> 3;
+       if ((nbytes << 3) < nblocks)
+               nbytes++;
+
+       map = calloc(1, nbytes);
+       if (!map)
+               return -ENOMEM;
+
+       for (blk = 0; blk < nblocks; blk++)
+               if (vhd->bat.bat[blk] != DD_BLK_UNUSED)
+                       set_bit(map, blk);
+
+       rc = 0;
+       if (write(STDOUT_FILENO, map, nbytes) < 0)
+               rc = -errno;
+
+       free(map);
+       return rc;
+}
+
+/*
+ * Write the raw sector bitmap of each allocated block among the @count
+ * blocks starting at @block to stdout.  Unallocated blocks are reported
+ * with a text line instead.
+ *
+ * Each bitmap buffer obtained from vhd_read_bitmap() is released on every
+ * path, including a failed write.
+ *
+ * Returns 0 on success, -ERANGE if check_block_range() rejects
+ * @block + @count, the error from vhd_read_bitmap(), or -errno if the
+ * write fails.
+ */
+static int
+vhd_print_bitmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       char *buf;
+       int i, err;
+       uint64_t cur;
+       ssize_t n;
+
+       if (check_block_range(vhd, block + count, hex))
+               return -ERANGE;
+
+       for (i = 0; i < count; i++) {
+               cur = block + i;
+
+               if (vhd->bat.bat[cur] == DD_BLK_UNUSED) {
+                       printf("block %s not allocated\n", conv(hex, cur));
+                       continue;
+               }
+
+               err = vhd_read_bitmap(vhd, cur, &buf);
+               if (err)
+                       return err;
+
+               n = write(STDOUT_FILENO, buf,
+                         vhd_sectors_to_bytes(vhd->bm_secs));
+               /* capture errno before free() has a chance to change it */
+               err = (n < 0) ? -errno : 0;
+
+               free(buf);
+               if (err)
+                       return err;
+       }
+
+       return 0;
+}
+
+/*
+ * For each of the @count sectors starting at @sector, print its block,
+ * its offset within the block, and its bit in the block's sector bitmap
+ * (0 for sectors of unallocated blocks).
+ *
+ * The bitmap of the most recently visited block is kept so that it is
+ * read only once for consecutive sectors of the same block.
+ *
+ * Returns 0 on success, -ERANGE if the range ends past the current disk
+ * size, or the error from vhd_read_bitmap().
+ */
+static int
+vhd_test_bitmap(vhd_context_t *vhd, uint64_t sector, int count, int hex)
+{
+       char *bitmap = NULL;
+       uint32_t cached_blk = -1;
+       int idx, rc = 0;
+
+       if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+               printf("sector %s past end of file\n", conv(hex, sector));
+               return -ERANGE;
+       }
+
+       for (idx = 0; idx < count; idx++) {
+               uint64_t cur = sector + idx;
+               uint32_t blk = cur / vhd->spb;
+               uint32_t off = cur % vhd->spb;
+               int allocated = (vhd->bat.bat[blk] != DD_BLK_UNUSED);
+               int set;
+
+               if (blk != cached_blk) {
+                       /* moved on to another block: drop the old bitmap */
+                       cached_blk = blk;
+                       free(bitmap);
+                       bitmap = NULL;
+
+                       if (allocated) {
+                               rc = vhd_read_bitmap(vhd, blk, &bitmap);
+                               if (rc)
+                                       break;
+                       }
+               }
+
+               set = allocated ? vhd_bitmap_test(vhd, bitmap, off) : 0;
+
+               printf("block %s: ", conv(hex, blk));
+               printf("sec: %s: %d\n", conv(hex, off), set);
+       }
+
+       free(bitmap);
+       return rc;
+}
+
+/*
+ * Print the runs of set bitmap bits found in the @count logical sectors
+ * starting at @sector.  Each run is printed as "<first sector> <length>".
+ *
+ * Sectors in blocks without a BAT entry count as clear.  Returns 0 on
+ * success, -ERANGE if the range extends past footer.curr_size, or the
+ * error from vhd_read_bitmap() (a run still open at that point is not
+ * printed).
+ */
+static int
+vhd_print_bitmap_extents(vhd_context_t *vhd, uint64_t sector, int count,
+                        int hex)
+{
+       int n, err, set;
+       char *bitmap;
+       uint64_t lsec;
+       uint32_t blk, sec, loaded;
+       int64_t start, run;
+
+       if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) {
+               printf("sector %s past end of file\n", conv(hex, sector));
+               return -ERANGE;
+       }
+
+       bitmap = NULL;
+       loaded = -1;    /* block whose bitmap is currently held in @bitmap */
+       start  = -1;    /* first sector of the run being accumulated */
+       run    = 0;     /* length of that run; 0 means no run is open */
+
+       for (n = 0; n < count; n++) {
+               lsec = sector + n;
+               blk  = lsec / vhd->spb;
+               sec  = lsec % vhd->spb;
+
+               if (blk != loaded) {
+                       free(bitmap);
+                       bitmap = NULL;
+                       loaded = blk;
+
+                       if (vhd->bat.bat[blk] != DD_BLK_UNUSED) {
+                               err = vhd_read_bitmap(vhd, blk, &bitmap);
+                               if (err)
+                                       goto out;
+                       }
+               }
+
+               set = 0;
+               if (vhd->bat.bat[blk] != DD_BLK_UNUSED)
+                       set = vhd_bitmap_test(vhd, bitmap, sec);
+
+               if (set) {
+                       if (!run)
+                               start = lsec;
+                       run++;
+                       continue;
+               }
+
+               /* clear bit: flush the run that just ended, if any */
+               if (run > 0) {
+                       printf("%s ", conv(hex, start));
+                       printf("%s\n", conv(hex, run));
+               }
+               run = 0;
+       }
+
+       /* flush a run that reaches the end of the requested range */
+       if (run > 0) {
+               printf("%s ", conv(hex, start));
+               printf("%s\n", conv(hex, run));
+       }
+
+       err = 0;
+ out:
+       free(bitmap);
+       return err;
+}
+
+/*
+ * Write the raw batmap to stdout.
+ *
+ * Returns 0 on success, the error from vhd_get_batmap() if the batmap
+ * cannot be loaded, or -errno if writing to stdout fails.
+ */
+static int
+vhd_print_batmap(vhd_context_t *vhd)
+{
+       int err;
+       size_t size;
+       ssize_t n;
+
+       err = vhd_get_batmap(vhd);
+       if (err) {
+               printf("failed to read batmap: %d\n", err);
+               return err;
+       }
+
+       size = vhd_sectors_to_bytes(vhd->batmap.header.batmap_size);
+
+       /* report a failed write instead of silently dropping it */
+       n = write(STDOUT_FILENO, vhd->batmap.map, size);
+       if (n < 0)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Report the batmap bit of @count consecutive blocks starting at @block.
+ * The per-block lines are written to stderr.
+ *
+ * Returns 0 on success, -ERANGE if check_block_range() rejects the range,
+ * or the error from vhd_get_batmap().
+ */
+static int
+vhd_test_batmap(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       int idx, err;
+       uint64_t blk;
+
+       if (check_block_range(vhd, block + count, hex))
+               return -ERANGE;
+
+       err = vhd_get_batmap(vhd);
+       if (err) {
+               fprintf(stderr, "failed to get batmap\n");
+               return err;
+       }
+
+       for (idx = 0, blk = block; idx < count; idx++, blk = block + idx)
+               fprintf(stderr, "batmap for block %s: %d\n", conv(hex, blk),
+                       vhd_batmap_test(vhd, &vhd->batmap, blk));
+
+       return 0;
+}
+
+/*
+ * Write the raw data of @count consecutive blocks starting at @block to
+ * stdout.  Blocks without a BAT entry are skipped with a notice.
+ *
+ * Returns 0 on success, -ERANGE if check_block_range() rejects the range,
+ * the error from vhd_read_block(), or -errno if writing to stdout fails.
+ */
+static int
+vhd_print_data(vhd_context_t *vhd, uint64_t block, int count, int hex)
+{
+       char *buf;
+       int i, err;
+       uint64_t cur;
+       ssize_t n;
+
+       err = 0;
+
+       if (check_block_range(vhd, block + count, hex))
+               return -ERANGE;
+
+       for (i = 0; i < count; i++) {
+               cur = block + i;
+
+               if (vhd->bat.bat[cur] == DD_BLK_UNUSED) {
+                       printf("block %s not allocated\n", conv(hex, cur));
+                       continue;
+               }
+
+               err = vhd_read_block(vhd, cur, &buf);
+               if (err)
+                       break;
+
+               /* capture errno before free() has a chance to disturb it */
+               n = write(STDOUT_FILENO, buf, vhd->header.block_size);
+               if (n < 0)
+                       err = -errno;
+
+               free(buf);
+               if (err)
+                       break;
+       }
+
+       return err;
+}
+
+/*
+ * Read @count sectors starting at sector @sec through vhd_io_read() and
+ * write them to stdout, at most VHD_BLOCK_SIZE bytes per iteration.
+ *
+ * Returns 0 on success, -ERANGE if the range extends past
+ * footer.curr_size, the negated posix_memalign() error, the error from
+ * vhd_io_read(), or -errno if writing to stdout fails.
+ */
+static int
+vhd_read_data(vhd_context_t *vhd, uint64_t sec, int count,
+        int hex __attribute__((unused)))
+{
+       void *buf;
+       uint64_t cur;
+       int err, max, secs;
+       ssize_t n;
+
+       if (vhd_sectors_to_bytes(sec + count) > vhd->footer.curr_size)
+               return -ERANGE;
+
+       max = MIN(vhd_sectors_to_bytes(count), VHD_BLOCK_SIZE);
+       err = posix_memalign(&buf, VHD_SECTOR_SIZE, max);
+       if (err)
+               return -err;
+
+       cur = sec;
+       while (count) {
+               secs = MIN((max >> VHD_SECTOR_SHIFT), count);
+               err  = vhd_io_read(vhd, buf, cur, secs);
+               if (err)
+                       break;
+
+               /* report a failed write instead of silently dropping it */
+               n = write(STDOUT_FILENO, buf, vhd_sectors_to_bytes(secs));
+               if (n < 0) {
+                       err = -errno;
+                       break;
+               }
+
+               cur   += secs;
+               count -= secs;
+       }
+
+       free(buf);
+       return err;
+}
+
+/*
+ * Read @count bytes starting at byte offset @byte through
+ * vhd_io_read_bytes() and write them to stdout, at most VHD_BLOCK_SIZE
+ * bytes per iteration.
+ *
+ * Returns 0 on success, -ERANGE if the range extends past
+ * footer.curr_size, the negated posix_memalign() error, the error from
+ * vhd_io_read_bytes(), or -errno if writing to stdout fails.
+ */
+static int
+vhd_read_bytes(vhd_context_t *vhd, uint64_t byte, int count,
+        int hex __attribute__((unused)))
+{
+       int err, cap, chunk;
+       uint64_t pos;
+       ssize_t written;
+       void *buf;
+
+       if (byte + count > vhd->footer.curr_size)
+               return -ERANGE;
+
+       cap = MIN(count, VHD_BLOCK_SIZE);
+       err = posix_memalign(&buf, VHD_SECTOR_SIZE, cap);
+       if (err)
+               return -err;
+
+       for (pos = byte; count; pos += chunk, count -= chunk) {
+               chunk = MIN(cap, count);
+
+               err = vhd_io_read_bytes(vhd, buf, chunk, pos);
+               if (err)
+                       break;
+
+               written = write(STDOUT_FILENO, buf, chunk);
+               if (written < 0) {
+                       err = -errno;
+                       break;
+               }
+       }
+
+       free(buf);
+       return err;
+}
+
+/*
+ * vhd-util "read" command: open the VHD named by -n read-only and run
+ * each inspection selected on the command line, in the fixed order coded
+ * below, stopping at the first one that fails.
+ *
+ * Numeric selectors start out as -1, meaning "option not given"; -c
+ * supplies the unit count shared by all of them (default 1).
+ *
+ * Returns 0 on success or the first error encountered.  Bad or missing
+ * arguments print the option summary and return EINVAL (positive).
+ */
+int
+vhd_util_read(int argc, char **argv)
+{
+       char *name;
+       vhd_context_t vhd;
+       int c, err, headers, hex, bat_str, cache, flags;
+       uint64_t bat, bitmap, tbitmap, ebitmap, batmap, tbatmap, data, lsec, 
count, read;
+       uint64_t bread;
+
+       err     = 0;
+       hex     = 0;
+       cache   = 0;
+       headers = 0;
+       bat_str = 0;
+       count   = 1;
+       bat     = -1;
+       bitmap  = -1;
+       tbitmap = -1;
+       ebitmap = -1;
+       batmap  = -1;
+       tbatmap = -1;
+       data    = -1;
+       lsec    = -1;
+       read    = -1;
+       bread   = -1;
+       name    = NULL;
+
+       if (!argc || !argv)
+               goto usage;
+
+       /* reset getopt state before parsing this command's arguments */
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:pt:b:Bm:i:e:aj:d:c:r:R:xCh")) != -1) {
+               switch(c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'p':
+                       headers = 1;
+                       break;
+               case 'C':
+                       cache = 1;
+                       break;
+               case 'B':
+                       bat_str = 1;
+                       break;
+               case 't':
+                       lsec = strtoul(optarg, NULL, 10);
+                       break;
+               case 'b':
+                       bat = strtoull(optarg, NULL, 10);
+                       break;
+               case 'm':
+                       bitmap = strtoull(optarg, NULL, 10);
+                       break;
+               case 'i':
+                       tbitmap = strtoul(optarg, NULL, 10);
+                       break;
+               case 'e':
+                       ebitmap = strtoul(optarg, NULL, 10);
+                       break;
+               case 'a':
+                       batmap = 1;
+                       break;
+               case 'j':
+                       tbatmap = strtoull(optarg, NULL, 10);
+                       break;
+               case 'd':
+                       data = strtoull(optarg, NULL, 10);
+                       break;
+               case 'r':
+                       read = strtoull(optarg, NULL, 10);
+                       break;
+               case 'R':
+                       bread = strtoull(optarg, NULL, 10);
+                       break;
+               case 'c':
+                       count = strtoul(optarg, NULL, 10);
+                       break;
+               case 'x':
+                       hex = 1;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       /* a name is mandatory and no positional arguments are accepted */
+       if (!name || optind != argc)
+               goto usage;
+
+       flags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED;
+       if (cache)
+               flags |= VHD_OPEN_CACHED | VHD_OPEN_FAST;
+       err = vhd_open(&vhd, name, flags);
+       if (err) {
+               printf("Failed to open %s: %d\n", name, err);
+               /* the image could not be opened: dump its headers instead */
+               vhd_dump_headers(name, hex);
+               return err;
+       }
+
+       err = vhd_get_bat(&vhd);
+       if (err) {
+               printf("Failed to get bat for %s: %d\n", name, err);
+               goto out;
+       }
+
+       if (headers)
+               vhd_print_headers(&vhd, hex);
+
+       /* each selector below runs only if its option was given (!= -1) */
+       if (lsec != -1) {
+               err = vhd_print_logical_to_physical(&vhd, lsec, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (bat != -1) {
+               err = vhd_print_bat(&vhd, bat, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (bat_str) {
+               err = vhd_print_bat_str(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       if (bitmap != -1) {
+               err = vhd_print_bitmap(&vhd, bitmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (tbitmap != -1) {
+               err = vhd_test_bitmap(&vhd, tbitmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (ebitmap != -1) {
+               err = vhd_print_bitmap_extents(&vhd, ebitmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (batmap != -1) {
+               err = vhd_print_batmap(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       if (tbatmap != -1) {
+               err = vhd_test_batmap(&vhd, tbatmap, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (data != -1) {
+               err = vhd_print_data(&vhd, data, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (read != -1) {
+               err = vhd_read_data(&vhd, read, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       if (bread != -1) {
+               err = vhd_read_bytes(&vhd, bread, count, hex);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+
+ out:
+       vhd_close(&vhd);
+       return err;
+
+ usage:
+       printf("options:\n"
+              "-h          help\n"
+              "-n          name\n"
+              "-p          print VHD headers\n"
+              "-t sec      translate logical sector to VHD location\n"
+              "-b blk      print bat entry\n"
+              "-B          print entire bat as a bitmap\n"
+              "-m blk      print bitmap\n"
+              "-i sec      test bitmap for logical sector\n"
+              "-e sec      output extent list of allocated logical sectors\n"
+              "-a          print batmap\n"
+              "-j blk      test batmap for block\n"
+              "-d blk      print data\n"
+              "-c num      num units\n"
+              "-r sec      read num sectors at sec\n"
+              "-R byte     read num bytes at byte\n"
+              "-x          print in hex\n");
+       return EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-repair.c b/tools/blktap3/vhd/lib/vhd-util-repair.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-repair.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * vhd-util "repair" command: open the VHD named by -n read-write and
+ * rewrite its footer from the footer held in the opened context.
+ *
+ * Returns 0 on success, the error from vhd_open() or vhd_write_footer(),
+ * or -EINVAL after printing the usage line for bad arguments or -h.
+ */
+int
+vhd_util_repair(int argc, char **argv)
+{
+       int opt, err;
+       char *path = NULL;
+       vhd_context_t vhd;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((opt = getopt(argc, argv, "n:h")) != -1) {
+               /* -n is the only real option; -h and anything else ask for help */
+               if (opt != 'n')
+                       goto usage;
+               path = optarg;
+       }
+
+       if (path == NULL || optind != argc)
+               goto usage;
+
+       err = vhd_open(&vhd, path, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", path, err);
+               return err;
+       }
+
+       err = vhd_write_footer(&vhd, &vhd.footer);
+       if (err)
+               printf("error writing footer: %d\n", err);
+
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("options: <-n name> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-resize.c b/tools/blktap3/vhd/lib/vhd-util-resize.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-resize.c
@@ -0,0 +1,1200 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+
+#include "libvhd-journal.h"
+
+/* Debug output: goes to stdout while the "#if 1" branch is selected. */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f, ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/*
+ * Error output: logged through syslog at LOG_INFO, prefixed with the name
+ * of the calling function, and echoed through DFPRINTF.
+ */
+#define EPRINTF(_f, _a...)                                     \
+       do {                                                    \
+               syslog(LOG_INFO, "%s: " _f, __func__, ##_a);    \
+               DFPRINTF(_f, _a);                               \
+       } while (0)
+
+/* A BAT index paired with the value stored in that BAT entry. */
+typedef struct vhd_block {
+       uint32_t block;         /* index into vhd->bat.bat[] */
+       uint32_t offset;        /* value of that BAT entry */
+} vhd_block_t;
+
+TEST_FAIL_EXTERN_VARS;
+
+/* Number of whole blocks contained in @secs sectors (rounds down). */
+static inline uint32_t
+secs_to_blocks_down(vhd_context_t *vhd, uint64_t secs)
+{
+       uint32_t whole = secs / vhd->spb;
+
+       return whole;
+}
+
+/* Number of blocks needed to hold @secs sectors (rounds up). */
+static uint32_t
+secs_to_blocks_up(vhd_context_t *vhd, uint64_t secs)
+{
+       uint32_t whole = secs / vhd->spb;
+
+       /* a partial trailing block still needs a block of its own */
+       return (secs % vhd->spb) ? whole + 1 : whole;
+}
+
+/*
+ * Shrink a fixed VHD by @secs sectors: truncate the file to the reduced
+ * size, update footer.curr_size and rewrite the footer.
+ *
+ * Returns 0 on success, -EINVAL if the reduced size would not exceed
+ * sizeof(vhd_footer_t), -errno if ftruncate() fails, or the error from
+ * vhd_write_footer().
+ */
+static int
+vhd_fixed_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+       int err;
+       uint64_t new_eof;
+       vhd_context_t *vhd;
+
+       vhd = &journal->vhd;
+
+       new_eof = vhd->footer.curr_size - vhd_sectors_to_bytes(secs);
+       if (new_eof <= sizeof(vhd_footer_t))
+               return -EINVAL;
+
+       err = ftruncate(vhd->fd, new_eof);
+       if (err)
+               return -errno;  /* negative, like every other error here */
+
+       vhd->footer.curr_size = new_eof;
+       return vhd_write_footer(vhd, &vhd->footer);
+}
+
+/*
+ * Write @size zero bytes to the VHD starting at file offset @off.
+ *
+ * The zeros come from a read-only anonymous mapping of at most
+ * VHD_BLOCK_SIZE bytes that is written out repeatedly.  Returns 0 on
+ * success, the error from vhd_seek() or vhd_write(), or -errno if the
+ * mapping cannot be created.
+ */
+static int
+vhd_write_zeros(vhd_journal_t *journal, off64_t off, uint64_t size)
+{
+       int err;
+       char *zeros;
+       uint64_t chunk, maplen;
+       vhd_context_t *vhd = &journal->vhd;
+
+       maplen = MIN(size, VHD_BLOCK_SIZE);
+
+       err = vhd_seek(vhd, off, SEEK_SET);
+       if (err)
+               return err;
+
+       zeros = mmap(0, maplen, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+       if (zeros == MAP_FAILED)
+               return -errno;
+
+       for (;;) {
+               chunk = MIN(size, maplen);
+
+               err = vhd_write(vhd, zeros, chunk);
+               if (err)
+                       break;
+
+               size -= chunk;
+               if (!size)
+                       break;
+       }
+
+       munmap(zeros, maplen);
+
+       return err;
+}
+
+/*
+ * Grow a fixed VHD by @secs sectors: zero-fill the new space starting
+ * where the current footer begins, seek to the new end position, bump
+ * footer.curr_size and rewrite the footer.
+ *
+ * Returns 0 on success, -errno if the end-of-file position cannot be
+ * determined, or the error from the failing vhd_* call.
+ */
+static int
+vhd_fixed_grow(vhd_journal_t *journal, uint64_t secs)
+{
+       int err;
+       uint64_t eof, bytes;
+       vhd_context_t *vhd = &journal->vhd;
+
+       bytes = vhd_sectors_to_bytes(secs);
+
+       err = vhd_seek(vhd, 0, SEEK_END);
+       if (err)
+               return err;
+
+       eof = vhd_position(vhd);
+       if (eof == (off64_t)-1)
+               return -errno;
+
+       /* the zeros start on top of the old footer */
+       err = vhd_write_zeros(journal, eof - sizeof(vhd_footer_t), bytes);
+       if (err)
+               return err;
+
+       err = vhd_seek(vhd, eof + bytes, SEEK_SET);
+       if (err)
+               return err;
+
+       vhd->footer.curr_size += bytes;
+       return vhd_write_footer(vhd, &vhd->footer);
+}
+
+/*
+ * Resize a fixed VHD to @size, where @size is converted to sectors by
+ * shifting left by (20 - VHD_SECTOR_SHIFT), i.e. it is taken in units of
+ * 2^20 bytes.
+ *
+ * Returns 0 if the size is unchanged, otherwise the result of
+ * vhd_fixed_shrink() or vhd_fixed_grow().
+ */
+static int
+vhd_fixed_resize(vhd_journal_t *journal, uint64_t size)
+{
+       vhd_context_t *vhd = &journal->vhd;
+       uint64_t have = vhd->footer.curr_size >> VHD_SECTOR_SHIFT;
+       uint64_t want = size << (20 - VHD_SECTOR_SHIFT);
+
+       if (have == want)
+               return 0;
+
+       if (have > want)
+               return vhd_fixed_shrink(journal, have - want);
+
+       return vhd_fixed_grow(journal, want - have);
+}
+
+/* Exchange the entries at indices @a and @b of @list. */
+static inline void
+swap(vhd_block_t *list, int a, int b)
+{
+       vhd_block_t *first  = list + a;
+       vhd_block_t *second = list + b;
+       vhd_block_t saved   = *first;
+
+       *first  = *second;
+       *second = saved;
+}
+
+/*
+ * Partition list[left..right] around the offset of list[pidx] so that
+ * entries with an offset >= the pivot come first (descending order).
+ * Returns the index at which the pivot entry ends up.
+ */
+static int
+partition(vhd_block_t *list, int left, int right, int pidx)
+{
+       int scan, store;
+       long long pivot;
+
+       pivot = list[pidx].offset;
+       store = left;
+
+       /* park the pivot at the right edge while the rest is partitioned */
+       swap(list, pidx, right);
+
+       for (scan = left; scan < right; scan++) {
+               if (list[scan].offset < pivot)
+                       continue;
+
+               swap(list, store, scan);
+               store++;
+       }
+
+       swap(list, right, store);
+       return store;
+}
+
+/*
+ * Sort list[left..right] in order of descending offset.
+ *
+ * The pivot is always the leftmost element, so already-sorted input is
+ * the worst case.  To keep the stack depth logarithmic even then, only
+ * the smaller partition is handled by recursion; the larger one is
+ * handled by the loop.  The resulting order is the same as recursing
+ * into both partitions.
+ */
+static void
+quicksort(vhd_block_t *list, int left, int right)
+{
+       int pidx;
+
+       while (left < right) {
+               pidx = partition(list, left, right, left);
+
+               if (pidx - left < right - pidx) {
+                       quicksort(list, left, pidx - 1);
+                       left = pidx + 1;
+               } else {
+                       quicksort(list, pidx + 1, right);
+                       right = pidx - 1;
+               }
+       }
+}
+
+/*
+ * Relocate block @src so that its bitmap starts at file offset @offset,
+ * followed directly by its data.
+ *
+ * The block is journaled first; after both parts have been copied the
+ * BAT entry (in memory) is pointed at the new location and the old
+ * location is overwritten with zeros.  Returns 0 on success, -EINVAL if
+ * @src has no BAT entry, or the error from the failing helper.
+ */
+static int
+vhd_move_block(vhd_journal_t *journal, uint32_t src, off64_t offset)
+{
+       int err;
+       char *buf = NULL;
+       size_t len;
+       off64_t dst, old_off;
+       vhd_context_t *vhd = &journal->vhd;
+
+       old_off = vhd->bat.bat[src];
+       if (old_off == DD_BLK_UNUSED)
+               return -EINVAL;
+       old_off = vhd_sectors_to_bytes(old_off);
+
+       err = vhd_journal_add_block(journal, src,
+                                   VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+       if (err)
+               goto out;
+
+       /* step 1: copy the bitmap to the destination */
+       dst = offset;
+       len = vhd_sectors_to_bytes(vhd->bm_secs);
+
+       err = vhd_read_bitmap(vhd, src, &buf);
+       if (err)
+               goto out;
+
+       err = vhd_seek(vhd, dst, SEEK_SET);
+       if (err)
+               goto out;
+
+       err = vhd_write(vhd, buf, len);
+       if (err)
+               goto out;
+
+       free(buf);
+       buf = NULL;
+
+       /* step 2: copy the data right behind the bitmap */
+       dst += len;
+       len  = vhd_sectors_to_bytes(vhd->spb);
+
+       err = vhd_read_block(vhd, src, &buf);
+       if (err)
+               goto out;
+
+       err = vhd_seek(vhd, dst, SEEK_SET);
+       if (err)
+               goto out;
+
+       err = vhd_write(vhd, buf, len);
+       if (err)
+               goto out;
+
+       /* step 3: repoint the BAT entry, then wipe the old location */
+       vhd->bat.bat[src] = offset >> VHD_SECTOR_SHIFT;
+
+       err = vhd_write_zeros(journal, old_off,
+                             vhd_sectors_to_bytes(vhd->bm_secs + vhd->spb));
+
+out:
+       free(buf);
+       return err;
+}
+
+/*
+ * Move block @src into the file location currently occupied by block
+ * @dest, then mark @dest as unused in the in-memory BAT.
+ *
+ * @dest is journaled before it is overwritten.  Returns 0 on success or
+ * the error from vhd_journal_add_block() / vhd_move_block().
+ */
+static int
+vhd_clobber_block(vhd_journal_t *journal, uint32_t src, uint32_t dest)
+{
+       int err;
+       vhd_context_t *vhd = &journal->vhd;
+       off64_t target = vhd_sectors_to_bytes(vhd->bat.bat[dest]);
+
+       err = vhd_journal_add_block(journal, dest,
+                                   VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA);
+       if (!err)
+               err = vhd_move_block(journal, src, target);
+       if (err)
+               return err;
+
+       vhd->bat.bat[dest] = DD_BLK_UNUSED;
+       return 0;
+}
+
+/*
+ * remove a list of blocks from the vhd file
+ * if a block to be removed:
+ *   - resides at the end of the file: simply clear its bat entry
+ *   - resides elsewhere: move the last block in the file into its position
+ *                        and update the bat to reflect this
+ *
+ * @original_free_list holds the @free_cnt blocks to remove; it is copied,
+ * not modified.  Returns 0 on success, -ENOMEM if a work list cannot be
+ * allocated, or the error from vhd_clobber_block().
+ *
+ * NOTE(review): the only caller visible here, vhd_dynamic_shrink(),
+ * returns -ENOSYS before reaching this function, so this path looks
+ * unexercised -- confirm before relying on it.
+ */
+static int
+vhd_defrag_shrink(vhd_journal_t *journal,
+                 vhd_block_t *original_free_list, int free_cnt)
+{
+       vhd_context_t *vhd;
+       int i, j, free_idx, err, cleared;
+       vhd_block_t *blocks, *free_list;
+
+       err       = 0;
+       blocks    = NULL;
+       free_list = NULL;
+       vhd       = &journal->vhd;
+
+       blocks = malloc(vhd->bat.entries * sizeof(vhd_block_t));
+       if (!blocks) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       free_list = malloc(free_cnt * sizeof(vhd_block_t));
+       if (!free_list) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < vhd->bat.entries; i++) {
+               blocks[i].block  = i;
+               blocks[i].offset = vhd->bat.bat[i];
+       }
+
+       memcpy(free_list, original_free_list,
+              free_cnt * sizeof(vhd_block_t));
+
+       /* sort both the to-free list and the bat list
+        * in order of descending file offset */
+       quicksort(free_list, 0, free_cnt - 1);
+       quicksort(blocks, 0, vhd->bat.entries - 1);
+
+       for (i = 0, free_idx = 0;
+            i < vhd->bat.entries && free_idx < free_cnt; i++) {
+               vhd_block_t *b = blocks + i;
+
+               if (b->offset == DD_BLK_UNUSED)
+                       continue;
+
+               cleared = 0;
+               for (j = free_idx; j < free_cnt; j++)
+                       if (b->block == free_list[j].block) {
+                               /* the last block in the file is in the list of
+                                * blocks to remove; no need to shuffle the
+                                * data -- just clear the bat entry */
+                               vhd->bat.bat[free_list[j].block] = DD_BLK_UNUSED;
+                               free_idx++;
+                               cleared = 1;
+                               break;
+                       }
+
+               /* a bare "continue" in the loop above would only advance j;
+                * skip the move explicitly, otherwise the block just cleared
+                * is clobbered and free_list may be indexed past its end */
+               if (cleared)
+                       continue;
+
+               err = vhd_clobber_block(journal, b->block,
+                                       free_list[free_idx++].block);
+               if (err)
+                       goto out;
+       }
+
+       /* clear any bat entries for blocks we did not shuffle */
+       for (i = free_idx; i < free_cnt; i++)
+               vhd->bat.bat[free_list[i].block] = DD_BLK_UNUSED;
+
+out:
+       free(blocks);
+       free(free_list);
+
+       return err;
+}
+
+/*
+ * Drop the last @entries entries from the BAT.
+ *
+ * Writes the header with the reduced max_bat_size, writes the footer
+ * with the matching curr_size and geometry, zeroes the dropped BAT
+ * entries and writes the BAT.  When the VHD has a batmap, the old batmap
+ * header is zeroed if its offset changed, and the batmap bits of the
+ * dropped blocks are cleared and written back.
+ *
+ * Returns 0 on success or the error from the failing helper.
+ */
+static int
+vhd_clear_bat_entries(vhd_journal_t *journal, uint32_t entries)
+{
+       int idx, err;
+       off64_t old_hdr_off, new_hdr_off;
+       vhd_context_t *vhd = &journal->vhd;
+       uint32_t old_cnt = vhd->header.max_bat_size;
+       uint32_t new_cnt = old_cnt - entries;
+
+       /* remember where the batmap header lives before the bat shrinks */
+       if (vhd_has_batmap(vhd)) {
+               err = vhd_batmap_header_offset(vhd, &old_hdr_off);
+               if (err)
+                       return err;
+       }
+
+       /* header */
+       vhd->header.max_bat_size = new_cnt;
+       err = vhd_write_header(vhd, &vhd->header);
+       if (err)
+               return err;
+
+       /* footer */
+       vhd->footer.curr_size = (uint64_t)new_cnt * vhd->header.block_size;
+       vhd->footer.geometry  = vhd_chs(vhd->footer.curr_size);
+       err = vhd_write_footer(vhd, &vhd->footer);
+       if (err)
+               return err;
+
+       /* bat: no space is reclaimed, the dropped entries are just zeroed */
+       for (idx = new_cnt; idx < old_cnt; idx++)
+               vhd->bat.bat[idx] = 0;
+
+       err = vhd_write_bat(vhd, &vhd->bat);
+       if (err)
+               return err;
+
+       /* only now shrink the count, so the write above covered the tail */
+       vhd->bat.entries = new_cnt;
+
+       if (!vhd_has_batmap(vhd))
+               return 0;
+
+       /* wipe the old batmap header if the header has moved */
+       err = vhd_batmap_header_offset(vhd, &new_hdr_off);
+       if (err)
+               return err;
+
+       if (old_hdr_off != new_hdr_off) {
+               size_t hdr_len = vhd_bytes_padded(sizeof(struct dd_batmap_hdr));
+
+               err = vhd_write_zeros(journal, old_hdr_off, hdr_len);
+               if (err)
+                       return err;
+       }
+
+       /* batmap: clear the bits of the dropped blocks */
+       for (idx = new_cnt; idx < old_cnt; idx++)
+               vhd_batmap_clear(vhd, &vhd->batmap, idx);
+
+       return vhd_write_batmap(vhd, &vhd->batmap);
+}
+
+/*
+ * Shrink a dynamic VHD by @secs sectors.
+ *
+ * Currently disabled: the function prints a notice and returns -ENOSYS
+ * straight away, so everything after that return is unreachable.  The
+ * unreachable code collects the allocated blocks among the last
+ * secs_to_blocks_down(secs) BAT entries, hands them to
+ * vhd_defrag_shrink(), drops the BAT entries and truncates the file.
+ */
+static int
+vhd_dynamic_shrink(vhd_journal_t *journal, uint64_t secs)
+{
+       off64_t eof;
+       uint32_t blocks;
+       vhd_context_t *vhd;
+       int i, j, err, free_cnt;
+       struct vhd_block *free_list;
+
+       printf("dynamic shrink not fully implemented\n");
+       return -ENOSYS;
+
+       /* --- unreachable from here on --- */
+       eof       = 0;
+       free_cnt  = 0;
+       free_list = NULL;
+       vhd       = &journal->vhd;
+
+       blocks    = secs_to_blocks_down(vhd, secs);
+       if (blocks == 0)
+               return 0;
+
+       if (vhd_has_batmap(vhd)) {
+               err = vhd_get_batmap(vhd);
+               if (err)
+                       return err;
+       }
+
+       free_list = malloc(blocks * sizeof(struct vhd_block));
+       if (!free_list)
+               return -ENOMEM;
+
+       /* walk the last @blocks bat entries, collecting the allocated ones */
+       for (i = vhd->bat.entries - 1, j = 0; i >= 0 && j < blocks; i--, j++) {
+               uint32_t blk = vhd->bat.bat[i];
+
+               if (blk != DD_BLK_UNUSED) {
+                       free_list[free_cnt].block  = i;
+                       free_list[free_cnt].offset = blk;
+                       free_cnt++;
+               }
+       }
+
+       if (free_cnt) {
+               err = vhd_defrag_shrink(journal, free_list, free_cnt);
+               if (err)
+                       goto out;
+       }
+
+       err = vhd_clear_bat_entries(journal, blocks);
+       if (err)
+               goto out;
+
+       /* remove data beyond footer */
+       err = vhd_end_of_data(vhd, &eof);
+       if (err)
+               goto out;
+
+       err = ftruncate(vhd->fd, eof + sizeof(vhd_footer_t));
+       if (err) {
+               err = -errno;
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       free(free_list);
+       return err;
+}
+
+/*
+ * Find the allocated block with the smallest BAT entry value and store
+ * its index and entry value in @block.  @block is left zeroed when no
+ * entry is allocated.
+ */
+static inline void
+vhd_first_data_block(vhd_context_t *vhd, vhd_block_t *block)
+{
+       int idx;
+       uint32_t entry;
+
+       memset(block, 0, sizeof(vhd_block_t));
+
+       for (idx = 0; idx < vhd->bat.entries; idx++) {
+               entry = vhd->bat.bat[idx];
+
+               if (entry == DD_BLK_UNUSED)
+                       continue;
+
+               /* keep the current candidate unless this one is lower */
+               if (block->offset && entry >= block->offset)
+                       continue;
+
+               block->block  = idx;
+               block->offset = entry;
+       }
+}
+
+/*
+ * Return the highest end position over all allocated blocks, where a
+ * block's end is its BAT entry plus vhd->spb plus vhd->bm_secs.
+ * Returns 0 when no block is allocated.
+ */
+static inline uint32_t
+vhd_next_block_offset(vhd_context_t *vhd)
+{
+       int idx;
+       uint32_t entry, end;
+       uint32_t highest = 0;
+
+       for (idx = 0; idx < vhd->bat.entries; idx++) {
+               entry = vhd->bat.bat[idx];
+               if (entry == DD_BLK_UNUSED)
+                       continue;
+
+               end = entry + vhd->spb + vhd->bm_secs;
+               if (end > highest)
+                       highest = end;
+       }
+
+       return highest;
+}
+
+/*
+ * Non-zero when @off lies strictly after @start and before
+ * @start + @size.  Note that @off == @start does not count as in range.
+ */
+static inline int
+in_range(off64_t off, off64_t start, off64_t size)
+{
+       if (start >= off)
+               return 0;
+
+       return start + size > off;
+}
+
+/*
+ * Bit flags for the "mode" argument of vhd_check_for_clobber(): each bit
+ * that is set disables the corresponding check.
+ */
+#define SKIP_HEADER 0x01
+#define SKIP_BAT    0x02
+#define SKIP_BATMAP 0x04
+#define SKIP_PLOC   0x08
+#define SKIP_DATA   0x10
+
+/* Non-zero when the SKIP_* bit @type is set in @mode. */
+static inline int
+skip_check(int mode, int type)
+{
+       return mode & type;
+}
+
+/*
+ * Decide whether a write reaching byte offset @off would overwrite part
+ * of the vhd that must be preserved.
+ *
+ * @off is tested against each region with in_range(); the callers in
+ * this file pass the offset at which the intended write ends.  @mode is
+ * a mask of SKIP_* bits naming the regions that must not be checked.
+ *
+ * Only dynamic vhds are checked (anything else returns 0).  Regions
+ * examined: the first sector (reported as "backup footer"), the header,
+ * the bat, the batmap if there is one, every parent locator whose code
+ * is not PLAT_CODE_NONE, and the first allocated data block.
+ *
+ * Returns 0 if nothing would be clobbered; otherwise logs the name of
+ * the region and returns -EINVAL.
+ */
+static int
+vhd_check_for_clobber(vhd_context_t *vhd, off64_t off, int mode)
+{
+       int i, n;
+       const char *msg;
+       size_t size;
+       vhd_block_t fb;
+       vhd_parent_locator_t *loc;
+
+       msg = NULL;
+
+       if (!vhd_type_dynamic(vhd))
+               return 0;
+
+       if (off < VHD_SECTOR_SIZE) {
+               msg = "backup footer";
+               goto fail;
+       }
+
+       if (!skip_check(mode, SKIP_HEADER)) {
+               if (in_range(off, vhd->footer.data_offset,
+                            sizeof(vhd_header_t))) {
+                       msg = "header";
+                       goto fail;
+               }
+       }
+
+       if (!skip_check(mode, SKIP_BAT)) {
+               if (in_range(off, vhd->header.table_offset,
+                            vhd_bytes_padded(vhd->header.max_bat_size *
+                                             sizeof(uint32_t)))) {
+                       msg = "bat";
+                       goto fail;
+               }
+       }
+
+       if (!skip_check(mode, SKIP_BATMAP) && vhd_has_batmap(vhd)) {
+               if (in_range(off, vhd->batmap.header.batmap_offset,
+                            vhd_bytes_padded(
+                                    vhd->batmap.header.batmap_size))) {
+                       msg = "batmap";
+                       goto fail;
+               }
+       }
+
+       if (!skip_check(mode, SKIP_PLOC)) {
+               n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+               for (i = 0; i < n; i++) {
+                       loc = vhd->header.loc + i;
+                       if (loc->code == PLAT_CODE_NONE)
+                               continue;
+
+                       size = vhd_parent_locator_size(loc);
+                       if (in_range(off, loc->data_offset, size)) {
+                               msg = "parent locator";
+                               goto fail;
+                       }
+               }
+       }
+
+       if (!skip_check(mode, SKIP_DATA)) {
+               /* only the lowest data block can be hit from below */
+               vhd_first_data_block(vhd, &fb);
+               if (fb.offset && in_range(off,
+                                         vhd_sectors_to_bytes(fb.offset),
+                                         VHD_BLOCK_SIZE)) {
+                       msg = "data block";
+                       goto fail;
+               }
+       }
+
+       return 0;
+
+fail:
+       EPRINTF("write to 0x%08"PRIx64" would clobber %s\n", off, msg);
+       return -EINVAL;
+}
+
+/*
+ * Take any metadata after the bat (@eob) and shift it.
+ *
+ * Every parent locator whose code is not PLAT_CODE_NONE and whose data
+ * starts at or after @eob is read into memory and rewritten
+ * @bat_needed + @map_needed bytes further into the file; its in-memory
+ * data_offset is updated to match.  If the vhd has a batmap located
+ * after @eob, the in-memory batmap offset is advanced by @bat_needed.
+ * Neither the header nor the batmap is written here; the in-body notes
+ * say that happens after the new bat has been written.
+ *
+ * Returns 0 on success, -EINVAL if a shifted locator would clobber other
+ * parts of the vhd, or a negative error from allocation, seek, read or
+ * write.  All temporary buffers are released on every path.
+ */
+static int
+vhd_shift_metadata(vhd_journal_t *journal, off64_t eob,
+                  size_t bat_needed, size_t map_needed)
+{
+       int i, n, err;
+       vhd_context_t *vhd;
+       size_t size_needed;
+       void *buf;
+       char **locators;
+       vhd_parent_locator_t *loc;
+
+       vhd         = &journal->vhd;
+       size_needed = bat_needed + map_needed;
+
+       n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t);
+
+       locators = calloc(n, sizeof(char *));
+       if (!locators)
+               return -ENOMEM;
+
+       /* pass 1: read every locator that lives after the bat */
+       for (i = 0; i < n; i++) {
+               size_t size;
+
+               loc = vhd->header.loc + i;
+               if (loc->code == PLAT_CODE_NONE)
+                       continue;
+
+               if (loc->data_offset < eob)
+                       continue;
+
+               size = vhd_parent_locator_size(loc);
+               err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
+               if (err) {
+                       /* posix_memalign returns a positive errno value */
+                       err = -err;
+                       goto out;
+               }
+
+               /*
+                * record the buffer right away so that the cleanup at
+                * 'out' frees it if the seek or read below fails
+                */
+               locators[i] = buf;
+
+               err  = vhd_seek(vhd, loc->data_offset, SEEK_SET);
+               if (err)
+                       goto out;
+
+               err  = vhd_read(vhd, buf, size);
+               if (err)
+                       goto out;
+       }
+
+       /* pass 2: write each saved locator at its shifted position */
+       for (i = 0; i < n; i++) {
+               off64_t off;
+               size_t size;
+
+               if (!locators[i])
+                       continue;
+
+               loc  = vhd->header.loc + i;
+               off  = loc->data_offset + size_needed;
+               size = vhd_parent_locator_size(loc);
+
+               if (vhd_check_for_clobber(vhd, off + size, SKIP_PLOC)) {
+                       EPRINTF("%s: shifting locator %d would clobber data\n",
+                               vhd->file, i);
+                       /* go through 'out' so nothing is leaked */
+                       err = -EINVAL;
+                       goto out;
+               }
+
+               err  = vhd_seek(vhd, off, SEEK_SET);
+               if (err)
+                       goto out;
+
+               err  = vhd_write(vhd, locators[i], size);
+               if (err)
+                       goto out;
+
+               free(locators[i]);
+               locators[i]      = NULL;
+               loc->data_offset = off;
+
+               /* write the new header after writing the new bat */
+       }
+
+       if (vhd_has_batmap(vhd) && vhd->batmap.header.batmap_offset > eob) {
+               vhd->batmap.header.batmap_offset += bat_needed;
+
+               /* write the new batmap after writing the new bat */
+       }
+
+       err = 0;
+
+out:
+       for (i = 0; i < n; i++)
+               free(locators[i]);
+       free(locators);
+
+       return err;
+}
+
+/*
+ * Extend the bat (and the batmap, if the vhd has one) by @entries
+ * entries and write the result to the vhd.
+ *
+ * Order of operations: check that the enlarged bat and batmap would not
+ * clobber anything, write the header with the new max_bat_size, write a
+ * new bat whose added entries are DD_BLK_UNUSED, write a new zero-
+ * extended batmap, then update and write the footer (curr_size,
+ * geometry, checksum).
+ *
+ * NOTE(review): when the vhd has no batmap the function returns right
+ * after the bat is written, so the footer is not updated on that path.
+ * That behaviour is kept as is; confirm it is intended.
+ *
+ * Returns 0 on success, -EINVAL if the new metadata would clobber data,
+ * or a negative error from allocation or from the vhd_write_* helpers.
+ */
+static int
+vhd_add_bat_entries(vhd_journal_t *journal, int entries)
+{
+       int err;
+       uint32_t i;
+       off64_t off;
+       vhd_bat_t new_bat;
+       vhd_context_t *vhd;
+       uint32_t new_entries;
+       vhd_batmap_t new_batmap;
+       uint64_t bat_size, new_bat_size, map_size, new_map_size;
+       void *bat, *map;
+
+       vhd          = &journal->vhd;
+       new_entries  = vhd->header.max_bat_size + entries;
+
+       bat_size     = vhd_bytes_padded(vhd->header.max_bat_size *
+                                       sizeof(uint32_t));
+       new_bat_size = vhd_bytes_padded(new_entries * sizeof(uint32_t));
+
+       /* the batmap holds one bit per bat entry */
+       map_size     = vhd_bytes_padded((vhd->header.max_bat_size + 7) >> 3);
+       new_map_size = vhd_bytes_padded((new_entries + 7) >> 3);
+
+       off = vhd->header.table_offset + new_bat_size;
+       if (vhd_check_for_clobber(vhd, off, SKIP_BAT | SKIP_BATMAP)) {
+               EPRINTF("%s: writing new bat of 0x%"PRIx64" bytes "
+                       "at 0x%08"PRIx64" would clobber data\n",
+                       vhd->file, new_bat_size, vhd->header.table_offset);
+               return -EINVAL;
+       }
+
+       if (vhd_has_batmap(vhd)) {
+               off = vhd->batmap.header.batmap_offset + new_map_size;
+               if (vhd_check_for_clobber(vhd, off, 0)) {
+                       EPRINTF("%s: writing new batmap of 0x%"PRIx64" bytes"
+                               " at 0x%08"PRIx64" would clobber data\n",
+                               vhd->file, new_map_size,
+                               vhd->batmap.header.batmap_offset);
+                       return -EINVAL;
+               }
+       }
+
+       /* update header */
+       vhd->header.max_bat_size = new_entries;
+       err = vhd_write_header(vhd, &vhd->header);
+       if (err)
+               return err;
+
+       /* allocate new bat */
+       err = posix_memalign(&bat, VHD_SECTOR_SIZE, new_bat_size);
+       if (err)
+               return -err;
+
+       new_bat.bat     = bat;
+       new_bat.spb     = vhd->bat.spb;
+       new_bat.entries = new_entries;
+       memcpy(new_bat.bat, vhd->bat.bat, bat_size);
+       for (i = vhd->bat.entries; i < new_entries; i++)
+               new_bat.bat[i] = DD_BLK_UNUSED;
+
+       /* write new bat */
+       err = vhd_write_bat(vhd, &new_bat);
+       if (err) {
+               free(new_bat.bat);
+               return err;
+       }
+
+       /* update in-memory bat */
+       free(vhd->bat.bat);
+       vhd->bat = new_bat;
+
+       if (!vhd_has_batmap(vhd))
+               return 0;
+
+       /* allocate new batmap */
+       err = posix_memalign(&map, VHD_SECTOR_SIZE, new_map_size);
+       if (err)
+               return -err;    /* positive errno -> negative, as above */
+
+       new_batmap.map    = map;
+       new_batmap.header = vhd->batmap.header;
+       new_batmap.header.batmap_size = secs_round_up_no_zero(new_map_size);
+       memcpy(new_batmap.map, vhd->batmap.map, map_size);
+       memset(new_batmap.map + map_size, 0, new_map_size - map_size);
+
+       /* write new batmap */
+       err = vhd_write_batmap(vhd, &new_batmap);
+       if (err) {
+               free(new_batmap.map);
+               return err;
+       }
+
+       /* update in-memory batmap */
+       free(vhd->batmap.map);
+       vhd->batmap = new_batmap;
+
+       /* update footer */
+       vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size;
+       vhd->footer.geometry  = vhd_chs(vhd->footer.curr_size);
+       vhd->footer.checksum  = vhd_checksum_footer(&vhd->footer);
+       err = vhd_write_footer(vhd, &vhd->footer);
+       if (err)
+               return err;
+
+       return 0;
+}
+
+/*
+ * Grow a dynamic vhd by @secs sectors (the caller in this file passes
+ * the difference between the new and the current size in sectors).
+ *
+ * The number of blocks to add is derived from @secs, then the extra bat
+ * and batmap bytes those blocks need are compared with the slack left in
+ * the sectors the bat and batmap already occupy.  If both fit, the
+ * entries are simply added.  Otherwise data blocks that sit too close to
+ * the metadata are relocated one at a time, the metadata after the bat
+ * is shifted, and only then are the entries added.
+ *
+ * Returns 0 on success or the error of the first step that fails.
+ */
+static int
+vhd_dynamic_grow(vhd_journal_t *journal, uint64_t secs)
+{
+       int err;
+       off64_t eob, eom;
+       vhd_context_t *vhd;
+       vhd_block_t first_block;
+       uint64_t blocks, size_needed;
+       uint64_t bat_needed, bat_size, bat_avail, bat_bytes, bat_secs;
+       uint64_t map_needed, map_size, map_avail, map_bytes, map_secs;
+
+       vhd         = &journal->vhd;
+
+       size_needed = 0;
+       bat_needed  = 0;
+       map_needed  = 0;
+
+       /* number of vhd blocks to add */
+       blocks      = secs_to_blocks_up(vhd, secs);
+
+       /* size in bytes needed for new bat entries */
+       bat_needed  = blocks * sizeof(uint32_t);
+       map_needed  = (blocks >> 3) + 1;
+
+       /* available bytes in current bat */
+       bat_bytes   = vhd->header.max_bat_size * sizeof(uint32_t);
+       bat_secs    = secs_round_up_no_zero(bat_bytes);
+       bat_size    = vhd_sectors_to_bytes(bat_secs);
+       bat_avail   = bat_size - bat_bytes;
+
+       if (vhd_has_batmap(vhd)) {
+               /* available bytes in current batmap */
+               map_bytes   = (vhd->header.max_bat_size + 7) >> 3;
+               map_secs    = vhd->batmap.header.batmap_size;
+               map_size    = vhd_sectors_to_bytes(map_secs);
+               map_avail   = map_size - map_bytes;
+       } else {
+               /* no batmap: nothing to grow on that side */
+               map_needed  = 0;
+               map_avail   = 0;
+       }
+
+       /* we have enough space already; just extend the bat */
+       if (bat_needed <= bat_avail && map_needed <= map_avail)
+               goto add_entries;
+
+       /* we need to add new sectors to the bat */
+       if (bat_needed > bat_avail) {
+               bat_needed -= bat_avail;
+               bat_needed  = vhd_bytes_padded(bat_needed);
+       } else
+               bat_needed  = 0;
+
+       /* we need to add new sectors to the batmap */
+       if (map_needed > map_avail) {
+               map_needed -= map_avail;
+               map_needed  = vhd_bytes_padded(map_needed);
+       } else
+               map_needed  = 0;
+
+       /* how many additional bytes do we need? */
+       size_needed = bat_needed + map_needed;
+
+       /* calculate space between end of headers and beginning of data */
+       err = vhd_end_of_headers(vhd, &eom);
+       if (err)
+               return err;
+
+       /* eob: first byte after the sectors the current bat occupies */
+       eob = vhd->header.table_offset + vhd_sectors_to_bytes(bat_secs);
+       vhd_first_data_block(vhd, &first_block);
+
+       /* no blocks allocated; just shift post-bat metadata */
+       if (!first_block.offset)
+               goto shift_metadata;
+
+       /*
+        * not enough space --
+        * move vhd data blocks to the end of the file to make room
+        */
+       do {
+               off64_t new_off, bm_size, gap_size;
+
+               /* destination: just past the highest allocated block */
+               new_off = vhd_sectors_to_bytes(vhd_next_block_offset(vhd));
+
+               /* data region of segment should begin on page boundary */
+               bm_size = vhd_sectors_to_bytes(vhd->bm_secs);
+               if ((new_off + bm_size) % 4096) {
+                       gap_size = 4096 - ((new_off + bm_size) % 4096);
+
+                       err = vhd_write_zeros(journal, new_off, gap_size);
+                       if (err)
+                               return err;
+
+                       new_off += gap_size;
+               }
+
+               err = vhd_move_block(journal, first_block.block, new_off);
+               if (err)
+                       return err;
+
+               /* re-evaluate: a different block is now the lowest one */
+               vhd_first_data_block(vhd, &first_block);
+
+       } while (eom + size_needed >= vhd_sectors_to_bytes(first_block.offset));
+
+       TEST_FAIL_AT(FAIL_RESIZE_DATA_MOVED);
+
+shift_metadata:
+       /* shift any metadata after the bat to make room for new bat sectors */
+       err = vhd_shift_metadata(journal, eob, bat_needed, map_needed);
+       if (err)
+               return err;
+
+       TEST_FAIL_AT(FAIL_RESIZE_METADATA_MOVED);
+
+add_entries:
+       return vhd_add_bat_entries(journal, blocks);
+}
+
+/*
+ * Resize a dynamic vhd to @size, given in MB (it is shifted left by
+ * 20 - VHD_SECTOR_SHIFT to obtain sectors).
+ *
+ * Nothing is done when the size is unchanged.  Otherwise the header, the
+ * bat and (if present) the batmap are loaded and the work is handed to
+ * vhd_dynamic_shrink() or vhd_dynamic_grow() with the size difference in
+ * sectors.  Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_dynamic_resize(vhd_journal_t *journal, uint64_t size)
+{
+       int rc;
+       vhd_context_t *ctx = &journal->vhd;
+       uint64_t have      = ctx->footer.curr_size >> VHD_SECTOR_SHIFT;
+       uint64_t want      = size << (20 - VHD_SECTOR_SHIFT);
+
+       if (have == want)
+               return 0;
+
+       rc = vhd_get_header(ctx);
+       if (rc)
+               return rc;
+
+       rc = vhd_get_bat(ctx);
+       if (rc)
+               return rc;
+
+       if (vhd_has_batmap(ctx)) {
+               rc = vhd_get_batmap(ctx);
+               if (rc)
+                       return rc;
+       }
+
+       if (have > want)
+               return vhd_dynamic_shrink(journal, have - want);
+
+       return vhd_dynamic_grow(journal, want - have);
+}
+
+/*
+ * Open @name read-only in strict mode and verify with
+ * vhd_creator_tapdisk() that resizing is supported for it.
+ *
+ * Returns 0 if the check passes, the vhd_open() error if the file cannot
+ * be opened, or -EINVAL if the creator check fails.  A message is
+ * printed for both failure cases.
+ */
+static int
+vhd_util_resize_check_creator(const char *name)
+{
+       int rc;
+       vhd_context_t ctx;
+
+       rc = vhd_open(&ctx, name, VHD_OPEN_RDONLY | VHD_OPEN_STRICT);
+       if (rc) {
+               printf("error opening %s: %d\n", name, rc);
+               return rc;
+       }
+
+       if (vhd_creator_tapdisk(&ctx)) {
+               vhd_close(&ctx);
+               return 0;
+       }
+
+       printf("%s not created by xen; resize not supported\n", name);
+       vhd_close(&ctx);
+       return -EINVAL;
+}
+
+/*
+ * Grow the vhd @name to @bytes without a journal.
+ *
+ * @bytes is rounded up to a whole number of VHD_BLOCK_SIZE blocks.  A
+ * rounded size below the current size is rejected with -EINVAL; an equal
+ * size is a no-op; a larger one is applied with vhd_set_virt_size().
+ * Returns 0 on success or the first error encountered.
+ */
+static int
+vhd_dynamic_grow_fast(const char *name, uint64_t bytes)
+{
+       int rc;
+       vhd_context_t ctx;
+       uint64_t nblocks, target;
+
+       rc = vhd_open(&ctx, name, VHD_OPEN_RDWR);
+       if (rc)
+               return rc;
+
+       rc = vhd_get_bat(&ctx);
+       if (!rc && vhd_has_batmap(&ctx))
+               rc = vhd_get_batmap(&ctx);
+       if (rc)
+               goto close;
+
+       nblocks = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+       target  = nblocks << VHD_BLOCK_SHIFT;
+
+       if (target < ctx.footer.curr_size) {
+               printf("%s: size (%"PRIu64") < curr size (%"PRIu64")\n", 
+                      name, target, ctx.footer.curr_size);
+               rc = -EINVAL;
+       } else if (target > ctx.footer.curr_size) {
+               rc = vhd_set_virt_size(&ctx, target);
+       }
+       /* target == current size: nothing to do, rc is still 0 */
+
+close:
+       vhd_close(&ctx);
+       return rc;
+}
+
+/*
+ * vhd-util "resize" entry point.
+ *
+ * Options: -n <name>, -s <size in MB>, and exactly one of -j <journal>
+ * or -f (fast); -h prints the usage text.  The -s argument must be a
+ * plain decimal number small enough that converting it to bytes does not
+ * overflow 64 bits; anything else is rejected with the usage text rather
+ * than being silently read as 0.
+ *
+ * With -f the vhd is grown in place through vhd_dynamic_grow_fast().
+ * Otherwise a journal is created, the dynamic or fixed resize is run,
+ * and the journal is committed on success or reverted on failure.
+ *
+ * Returns 0 on success, -EINVAL for bad arguments, or the error of the
+ * failing step (the resize error takes precedence over a journal error).
+ */
+int
+vhd_util_resize(int argc, char **argv)
+{
+       char *name, *jname, *end;
+       uint64_t size;
+       int fast, c, err, jerr;
+       vhd_journal_t journal;
+       vhd_context_t *vhd;
+
+       err   = -EINVAL;
+       size  = 0;
+       name  = NULL;
+       jname = NULL;
+       fast  = 0;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:s:j:fh")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'j':
+                       jname = optarg;
+                       break;
+               case 'f':
+                       fast = 1;
+                       break;
+               case 's':
+                       errno = 0;
+                       size  = strtoull(optarg, &end, 10);
+                       /*
+                        * reject empty/non-numeric input, trailing junk,
+                        * out-of-range values, and sizes whose byte count
+                        * (size << 20) would not fit in 64 bits
+                        */
+                       if (errno || end == optarg || *end != '\0' ||
+                           size > (UINT64_MAX >> 20)) {
+                               printf("invalid size: %s\n", optarg);
+                               goto usage;
+                       }
+                       err = 0;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       if (err || !name || (!jname && !fast) || argc != optind)
+               goto usage;
+
+       if (jname && fast)
+               goto usage;
+
+       err = vhd_util_resize_check_creator(name);
+       if (err)
+               return err;
+
+       libvhd_set_log_level(1);
+
+       if (fast)
+               return vhd_dynamic_grow_fast(name, size << 20);
+
+       err = vhd_journal_create(&journal, name, jname);
+       if (err) {
+               printf("creating journal failed: %d\n", err);
+               return err;
+       }
+
+       vhd = &journal.vhd;
+
+       err = vhd_get_footer(vhd);
+       if (err)
+               goto out;
+
+       TEST_FAIL_AT(FAIL_RESIZE_BEGIN);
+
+       if (vhd_type_dynamic(vhd))
+               err = vhd_dynamic_resize(&journal, size);
+       else
+               err = vhd_fixed_resize(&journal, size);
+
+       TEST_FAIL_AT(FAIL_RESIZE_END);
+
+out:
+       if (err) {
+               printf("resize failed: %d\n", err);
+               jerr = vhd_journal_revert(&journal);
+       } else
+               jerr = vhd_journal_commit(&journal);
+
+       /* keep the journal file around if it could not be applied */
+       if (jerr) {
+               printf("closing journal failed: %d\n", jerr);
+               vhd_journal_close(&journal);
+       } else
+               vhd_journal_remove(&journal);
+
+       return (err ? err : jerr);
+
+usage:
+       printf("options: <-n name> <-s size (in MB)> (<-j journal>|<-f fast>) "
+                       "[-h help]\n\n"
+                       "The resize operation can only be performed offline "
+                       "and must be journaled because resizing the metadata "
+                       "might require moving data blocks. However, if a "
+                       "VHD was created with -S <msize> option (during "
+                       "vhd-util create/snapshot), which preallocates the "
+                       "metadata for growing the VHD up to size <msize>, then "
+                       "resizing such a VHD up to <msize> can be performed "
+                       "online without journaling (-f option).\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-revert.c b/tools/blktap3/vhd/lib/vhd-util-revert.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-revert.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+//#include <fcntl.h>
+#include <stdio.h>
+//#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * vhd-util "revert" entry point.
+ *
+ * Options: -n <name> and -j <journal> are both required; -h prints the
+ * usage text.  The journal is opened, reverted and then removed.  If the
+ * revert or the removal fails the journal is closed and the error is
+ * returned.  Returns 0 on success and -EINVAL for bad arguments.
+ */
+int
+vhd_util_revert(int argc, char **argv)
+{
+       int opt, rc;
+       vhd_journal_t jrnl;
+       char *vhd_path = NULL, *jrnl_path = NULL;
+
+       optind = 0;
+       for (;;) {
+               opt = getopt(argc, argv, "n:j:h");
+               if (opt == -1)
+                       break;
+
+               if (opt == 'n')
+                       vhd_path = optarg;
+               else if (opt == 'j')
+                       jrnl_path = optarg;
+               else
+                       goto usage;     /* -h or anything unknown */
+       }
+
+       if (!vhd_path || !jrnl_path || argc != optind)
+               goto usage;
+
+       libvhd_set_log_level(1);
+       rc = vhd_journal_open(&jrnl, vhd_path, jrnl_path);
+       if (rc) {
+               printf("opening journal failed: %d\n", rc);
+               return rc;
+       }
+
+       rc = vhd_journal_revert(&jrnl);
+       if (rc) {
+               printf("reverting journal failed: %d\n", rc);
+               goto fail;
+       }
+
+       rc = vhd_journal_remove(&jrnl);
+       if (rc) {
+               printf("removing journal failed: %d\n", rc);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       vhd_journal_close(&jrnl);
+       return rc;
+
+usage:
+       printf("options: <-n name> <-j journal> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-scan.c b/tools/blktap3/vhd/lib/vhd-util-scan.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-scan.c
@@ -0,0 +1,1372 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <glob.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fnmatch.h>
+#include <limits.h>
+#include <libgen.h>
+#include <syslog.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "libvhd.h"
+#include "lvm-util.h"
+
+/* Bits of the file-scope 'flags' word that select scan/print behaviour. */
+#define VHD_SCAN_FAST        0x01
+#define VHD_SCAN_PRETTY      0x02
+#define VHD_SCAN_VOLUME      0x04
+#define VHD_SCAN_NOFAIL      0x08
+#define VHD_SCAN_VERBOSE     0x10
+#define VHD_SCAN_PARENTS     0x20
+#define VHD_SCAN_MARKERS     0x40
+
+/* Values of struct target's 'type' (see target_volume(), target_vhd()). */
+#define VHD_TYPE_RAW_FILE    0x01
+#define VHD_TYPE_VHD_FILE    0x02
+#define VHD_TYPE_RAW_VOLUME  0x04
+#define VHD_TYPE_VHD_VOLUME  0x08
+
+/*
+ * printf-style logging helper: sends the message to syslog at LOG_INFO,
+ * prefixed with the name of the calling function.
+ */
+#define EPRINTF(_f, _a...)                                     \
+       do {                                                    \
+               syslog(LOG_INFO, "%s: " _f, __func__, ##_a);    \
+       } while (0)
+
+/* Non-zero iff @type is one of the two *_VOLUME target types. */
+static inline int
+target_volume(uint8_t type)
+{
+       switch (type) {
+       case VHD_TYPE_RAW_VOLUME:
+       case VHD_TYPE_VHD_VOLUME:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Non-zero iff @type is one of the two VHD_* target types. */
+static inline int
+target_vhd(uint8_t type)
+{
+       switch (type) {
+       case VHD_TYPE_VHD_FILE:
+       case VHD_TYPE_VHD_VOLUME:
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * Description of one scan target.
+ * NOTE(review): the code that fills these fields is outside this part of
+ * the file; the notes below are read off names and types only -- confirm.
+ */
+struct target {
+       char                 name[VHD_MAX_NAME_LEN];
+       char                 device[VHD_MAX_NAME_LEN];
+       uint64_t             size;
+       uint64_t             start;
+       uint64_t             end;
+       uint8_t              type;      /* presumably a VHD_TYPE_* value */
+};
+
+/*
+ * Cursor over an array of struct target.
+ * NOTE(review): not used in this part of the file; presumably 'cur' is
+ * the position, 'cur_size' the number of valid entries and 'max_size'
+ * the allocated capacity of 'targets' -- confirm against the users.
+ */
+struct iterator {
+       int                  cur;
+       int                  cur_size;
+       int                  max_size;
+       struct target       *targets;
+};
+
+/* List head type used for the 'children' list of a vhd_image. */
+TAILQ_HEAD(tqh_vhd_image, vhd_image);
+
+/*
+ * One scanned image as it is printed.  In pretty mode copies of these
+ * are collected in 'scan' and linked into parent/child trees before
+ * being printed.
+ */
+struct vhd_image {
+       char                *name;      /* printed as vhd=; sort/search key */
+       char                *parent;    /* parent name, NULL prints "none" */
+       uint64_t             capacity;  /* printed as capacity= */
+       off64_t              size;      /* printed as size= */
+       uint8_t              hidden;    /* printed as hidden= */
+       char                 marker;    /* printed as marker= if requested */
+       int                  error;     /* non-zero: print scan-error line */
+       char                *message;   /* text for the scan-error line */
+
+       struct target       *target;
+
+       TAILQ_ENTRY(vhd_image)     sibling;     /* link in parent's children */
+       struct tqh_vhd_image     children;      /* images naming this parent */
+       struct vhd_image    *parent_image;      /* resolved parent or NULL */
+};
+
+/*
+ * Storage for the images collected in pretty mode.  Image structs are
+ * allocated in chunks ('lists'); 'images' is a flat table of pointers
+ * into those chunks, which is what gets sorted and searched.
+ */
+struct vhd_scan {
+       int                  cur;       /* number of entries of images in use */
+       int                  size;      /* number of entries images can hold */
+
+       int                  lists_cur;         /* chunks allocated so far */
+       int                  lists_size;        /* capacity of lists[] */
+
+       struct vhd_image   **images;
+       struct vhd_image   **lists;
+};
+
+static int flags;
+static struct vg vg;
+static struct vhd_scan scan;
+
+/*
+ * (Re)initialise the file-scope 'scan' state with room for @cnt images.
+ *
+ * One chunk of @cnt image structs is allocated and every slot of the
+ * pointer table is pointed at its element of that chunk.  A @cnt below 1
+ * is treated as 1: zero-sized allocations may legitimately return NULL,
+ * and vhd_util_scan_pretty_add_image() grows the table by doubling, which
+ * cannot get anywhere from a size of 0.
+ *
+ * Returns 0 on success or -ENOMEM, in which case 'scan' is left zeroed.
+ */
+static int
+vhd_util_scan_pretty_allocate_list(int cnt)
+{
+       int i;
+
+       if (cnt < 1)
+               cnt = 1;
+
+       memset(&scan, 0, sizeof(scan));
+
+       scan.lists_cur  = 1;
+       scan.lists_size = 10;
+
+       scan.lists = calloc(scan.lists_size, sizeof(struct vhd_image *));
+       if (!scan.lists)
+               goto fail;
+
+       scan.lists[0] = calloc(cnt, sizeof(struct vhd_image));
+       if (!scan.lists[0])
+               goto fail;
+
+       scan.images = calloc(cnt, sizeof(struct vhd_image *));
+       if (!scan.images)
+               goto fail;
+
+       for (i = 0; i < cnt; i++)
+               scan.images[i] = scan.lists[0] + i;
+
+       scan.cur  = 0;
+       scan.size = cnt;
+
+       return 0;
+
+fail:
+       if (scan.lists) {
+               free(scan.lists[0]);
+               free(scan.lists);
+       }
+
+       free(scan.images);
+       memset(&scan, 0, sizeof(scan));
+       return -ENOMEM;
+}
+
+/*
+ * Release every chunk recorded in scan.lists, the lists array itself and
+ * the pointer table, then zero the file-scope 'scan' state.
+ */
+static void
+vhd_util_scan_pretty_free_list(void)
+{
+       int idx = 0;
+
+       if (scan.lists) {
+               while (idx < scan.lists_cur)
+                       free(scan.lists[idx++]);
+
+               free(scan.lists);
+       }
+
+       free(scan.images);
+       memset(&scan, 0, sizeof(scan));
+}
+
+/*
+ * Record a copy of @image in the file-scope 'scan' state for later
+ * pretty-printing.
+ *
+ * An image whose name is already recorded is ignored (returns 0).  When
+ * the pointer table is full its capacity is doubled: the table is
+ * reallocated and a new chunk of image structs is allocated for the
+ * added slots.  The bookkeeping (scan.size, scan.lists_cur) is only
+ * updated once every allocation has succeeded, so a failed allocation
+ * leaves 'scan' consistent and usable.
+ *
+ * name and parent are duplicated with strdup(); message is shared with
+ * @image (the pointer is copied).
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int
+vhd_util_scan_pretty_add_image(struct vhd_image *image)
+{
+       int i;
+       struct vhd_image *img;
+
+       for (i = 0; i < scan.cur; i++) {
+               img = scan.images[i];
+               if (!strcmp(img->name, image->name))
+                       return 0;
+       }
+
+       if (scan.cur >= scan.size) {
+               int grow, new_size;
+               struct vhd_image *chunk, **tmp;
+
+               /* double the capacity; start from 1 if there is none */
+               grow     = (scan.size > 0 ? scan.size : 1);
+               new_size = scan.size + grow;
+
+               /* make sure the new chunk can be recorded in lists[] */
+               if (scan.lists_cur >= scan.lists_size) {
+                       int lists_size;
+
+                       lists_size = (scan.lists_size > 0 ?
+                                     scan.lists_size * 2 : 1);
+                       tmp = realloc(scan.lists, lists_size *
+                                     sizeof(struct vhd_image *));
+                       if (!tmp)
+                               return -ENOMEM;
+
+                       scan.lists_size = lists_size;
+                       scan.lists      = tmp;
+               }
+
+               /*
+                * grow the pointer table first: if this fails nothing
+                * else has changed, and if the chunk allocation below
+                * fails the table is merely larger than scan.size says
+                */
+               tmp = realloc(scan.images, new_size *
+                             sizeof(struct vhd_image *));
+               if (!tmp)
+                       return -ENOMEM;
+
+               scan.images = tmp;
+
+               chunk = calloc(grow, sizeof(struct vhd_image));
+               if (!chunk)
+                       return -ENOMEM;
+
+               /* everything succeeded: commit the new state */
+               scan.lists[scan.lists_cur++] = chunk;
+               for (i = 0; i < grow; i++)
+                       scan.images[scan.size + i] = chunk + i;
+               scan.size = new_size;
+       }
+
+       img = scan.images[scan.cur];
+       TAILQ_INIT(&img->children);
+
+       img->capacity = image->capacity;
+       img->size     = image->size;
+       img->hidden   = image->hidden;
+       img->marker   = image->marker;
+       img->error    = image->error;
+       img->message  = image->message;
+
+       img->name = strdup(image->name);
+       if (!img->name)
+               goto fail;
+
+       if (image->parent) {
+               img->parent = strdup(image->parent);
+               if (!img->parent)
+                       goto fail;
+       }
+
+       scan.cur++;
+       return 0;
+
+fail:
+       /* the slot is not consumed; wipe it so it can be reused */
+       free(img->name);
+       free(img->parent);
+       memset(img, 0, sizeof(*img));
+       return -ENOMEM;
+}
+
+/*
+ * qsort()/bsearch() comparison callback for the scan.images table: both
+ * arguments point at 'struct vhd_image *' elements, which are ordered by
+ * strcmp() of their names.
+ */
+static int
+vhd_util_scan_pretty_image_compare(const void *lhs, const void *rhs)
+{
+       const struct vhd_image *const *a = lhs;
+       const struct vhd_image *const *b = rhs;
+
+       return strcmp((*a)->name, (*b)->name);
+}
+
+/*
+ * Print one line describing @image, right-aligned behind @tab columns of
+ * padding (no padding at all when @tab is 0).
+ *
+ * An image with a non-zero error gets the scan-error form, which always
+ * shows the full name.  Otherwise the capacity/size/hidden/parent form
+ * is printed, with a marker= field when VHD_SCAN_MARKERS is set.  Unless
+ * VHD_SCAN_VERBOSE is set, names are reduced with basename().  In pretty
+ * mode a parent that was named but not resolved is annotated.
+ */
+static void
+vhd_util_scan_print_image_indent(struct vhd_image *image, int tab)
+{
+       int unresolved;
+       char *lead, *vhd_name, *parent_name, *note;
+
+       lead = "";
+       if (tab)
+               lead = " ";
+
+       vhd_name    = image->name;
+       parent_name = (image->parent ? image->parent : "none");
+
+       unresolved = ((flags & VHD_SCAN_PRETTY) &&
+                     image->parent && !image->parent_image);
+       note       = (unresolved ? " (not found in scan)" : "");
+
+       if (!(flags & VHD_SCAN_VERBOSE)) {
+               vhd_name = basename(image->name);
+               if (image->parent)
+                       parent_name = basename(image->parent);
+       }
+
+       if (image->error) {
+               printf("%*svhd=%s scan-error=%d error-message='%s'\n",
+                      tab, lead, image->name, image->error, image->message);
+               return;
+       }
+
+       if (flags & VHD_SCAN_MARKERS)
+               printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u "
+                      "marker=%u parent=%s%s\n", tab, lead, vhd_name,
+                      image->capacity, image->size, image->hidden,
+                      (uint8_t)image->marker, parent_name, note);
+       else
+               printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u "
+                      "parent=%s%s\n", tab, lead, vhd_name, image->capacity,
+                      image->size, image->hidden, parent_name, note);
+}
+
+/*
+ * Print @image indented by three columns per @depth level, then recurse
+ * into its children: first those that are not hidden, then the hidden
+ * ones.  Afterwards the image's name and parent strings are freed and
+ * set to NULL, which is how the caller recognises images that have
+ * already been printed.
+ */
+static void
+vhd_util_scan_pretty_print_tree(struct vhd_image *image, int depth)
+{
+       int want_hidden;
+       struct vhd_image *child, *next;
+
+       vhd_util_scan_print_image_indent(image, depth * 3);
+
+       for (want_hidden = 0; want_hidden <= 1; want_hidden++) {
+               TAILQ_FOREACH_SAFE(child, &image->children, sibling, next) {
+                       if (!!child->hidden != want_hidden)
+                               continue;
+
+                       vhd_util_scan_pretty_print_tree(child, depth + 1);
+               }
+       }
+
+       free(image->name);
+       image->name = NULL;
+
+       free(image->parent);
+       image->parent = NULL;
+}
+
+/*
+ * Print every image collected in 'scan' as a set of parent/child trees.
+ *
+ * The pointer table is sorted by name so parents can be found with
+ * bsearch(); each image with a resolvable parent is linked into that
+ * parent's children list.  The trees are then printed in three passes.
+ * vhd_util_scan_pretty_print_tree() clears the name of everything it
+ * prints, which is what the later passes use to skip printed images.
+ */
+static void
+vhd_util_scan_pretty_print_images(void)
+{
+       int i;
+       struct vhd_image *image, **parentp, *parent, *keyp, key;
+
+       qsort(scan.images, scan.cur, sizeof(scan.images[0]),
+             vhd_util_scan_pretty_image_compare);
+
+       /* resolve each image's parent name to an entry of the table */
+       for (i = 0; i < scan.cur; i++) {
+               image = scan.images[i];
+
+               if (!image->parent) {
+                       image->parent_image = NULL;
+                       continue;
+               }
+
+               /* the table holds pointers, so search with a pointer key */
+               memset(&key, 0, sizeof(key));
+               key.name = image->parent;
+               keyp     = &key;
+
+               parentp  = bsearch(&keyp, scan.images, scan.cur,
+                                  sizeof(scan.images[0]),
+                                  vhd_util_scan_pretty_image_compare);
+               if (!parentp) {
+                       image->parent_image = NULL;
+                       continue;
+               }
+
+               parent = *parentp;
+               image->parent_image = parent;
+               TAILQ_INSERT_TAIL(&parent->children, image, sibling);
+       }
+
+       /* pass 1: trees whose root has no resolved parent and is hidden */
+       for (i = 0; i < scan.cur; i++) {
+               image = scan.images[i];
+
+               if (image->parent_image || !image->hidden)
+                       continue;
+
+               vhd_util_scan_pretty_print_tree(image, 0);
+       }
+
+       /* pass 2: the remaining unprinted roots without a resolved parent */
+       for (i = 0; i < scan.cur; i++) {
+               image = scan.images[i];
+
+               if (!image->name || image->parent_image)
+                       continue;
+
+               vhd_util_scan_pretty_print_tree(image, 0);
+       }
+
+       /* pass 3: anything still unprinted, i.e. not reached from a root */
+       for (i = 0; i < scan.cur; i++) {
+               image = scan.images[i];
+
+               if (!image->name)
+                       continue;
+
+               vhd_util_scan_pretty_print_tree(image, 0);
+       }
+}
+
+/*
+ * Emit one scanned image.
+ *
+ * In pretty mode an error-free image is only queued for the later tree
+ * printout.  Images carrying an error, images that cannot be queued, and
+ * all images in plain mode are printed immediately without indentation.
+ */
+static void
+vhd_util_scan_print_image(struct vhd_image *image)
+{
+       int err;
+
+       if (image->error || !(flags & VHD_SCAN_PRETTY)) {
+               vhd_util_scan_print_image_indent(image, 0);
+               return;
+       }
+
+       err = vhd_util_scan_pretty_add_image(image);
+       if (!err)
+               return;
+
+       /* queueing failed: report that, unless an error is already set */
+       if (!image->error) {
+               image->error   = err;
+               image->message = "allocating memory";
+       }
+
+       vhd_util_scan_print_image_indent(image, 0);
+}
+
+/*
+ * Report a failure to scan @file by printing a placeholder image record
+ * that carries @err.  The (const char *, int) signature also allows this
+ * function to be handed to glob() as its error callback.
+ *
+ * Always returns @err.
+ */
+static int
+vhd_util_scan_error(const char *file, int err)
+{
+       struct vhd_image failed;
+
+       memset(&failed, 0, sizeof(failed));
+       failed.error   = err;
+       failed.message = "failure scanning target";
+       failed.name    = (char *)file;
+
+       vhd_util_scan_print_image(&failed);
+
+       /*
+        * NOTE: a VHD_SCAN_NOFAIL short-circuit ("return 0") used to sit
+        * here but is disabled; the error is always propagated.
+        */
+       return err;
+}
+
+/*
+ * Pick the parent locator to read the parent name from.
+ *
+ * Preference order over the eight header slots: the first PLAT_CODE_MACX
+ * entry wins outright; otherwise the last PLAT_CODE_W2RU entry; otherwise
+ * the first entry whose code is not PLAT_CODE_NONE.
+ *
+ * Returns NULL when every slot is PLAT_CODE_NONE.
+ */
+static vhd_parent_locator_t *
+vhd_util_scan_get_parent_locator(vhd_context_t *vhd)
+{
+       int slot;
+       vhd_parent_locator_t *cur, *best;
+
+       best = NULL;
+
+       for (slot = 0; slot < 8; slot++) {
+               cur = vhd->header.loc + slot;
+
+               if (cur->code == PLAT_CODE_MACX)
+                       return cur;
+
+               if (cur->code == PLAT_CODE_W2RU ||
+                   (!best && cur->code != PLAT_CODE_NONE))
+                       best = cur;
+       }
+
+       return best;
+}
+
+/*
+ * Copy @src into @dst, which must hold at least VHD_MAX_NAME_LEN bytes.
+ *
+ * Returns 0 if the whole string fit, -ENAMETOOLONG if it was truncated.
+ */
+static inline int
+copy_name(char *dst, const char *src)
+{
+       int written;
+
+       written = snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+
+       return (written < VHD_MAX_NAME_LEN) ? 0 : -ENAMETOOLONG;
+}
+
+/*
+ * LVHD stores realpath(parent) in parent locators, so
+ * /dev/<vol-group>/<lv-name> becomes /dev/mapper/<vol--group>-<lv--name>
+ *
+ * Recover <lv-name> from such a device-mapper path: the last path
+ * component of @src is un-escaped (single dash -> slash, double dash ->
+ * single dash) and whatever follows the last resulting slash is written
+ * to @dst.
+ *
+ * @dst must hold at least VHD_MAX_NAME_LEN bytes (the visible caller
+ * passes a buffer of exactly that size).
+ *
+ * Returns 0 on success.  Returns -EINVAL, with @src copied (possibly
+ * truncated) into @dst, when the un-escaped component contains no slash
+ * at all.  Returns -ENAMETOOLONG, with @dst set to the empty string, when
+ * the component does not fit in VHD_MAX_NAME_LEN bytes.
+ */
+static int
+vhd_util_scan_extract_volume_name(char *dst, const char *src)
+{
+       char copy[VHD_MAX_NAME_LEN], *c;
+       const char *name, *s;
+
+       name = strrchr(src, '/');
+       if (!name)
+               name = src;
+
+       /* convert single dashes to slashes, double dashes to single dashes */
+       for (c = copy, s = name; *s != '\0'; s++, c++) {
+               /* always keep room for the terminating NUL */
+               if (c - copy >= VHD_MAX_NAME_LEN - 1) {
+                       dst[0] = '\0';
+                       return -ENAMETOOLONG;
+               }
+
+               if (*s == '-') {
+                       if (s[1] != '-')
+                               *c = '/';
+                       else {
+                               s++;
+                               *c = '-';
+                       }
+               } else
+                       *c = *s;
+       }
+
+       *c = '\0';
+       c = strrchr(copy, '/');
+       if (!c) {
+               /*
+                * unrecognized format: no path separator and no single
+                * dash.  (The previous test compared @c against a pointer
+                * into @src, which could never match, and a NULL result
+                * was incremented and handed to strcpy().)
+                */
+               snprintf(dst, VHD_MAX_NAME_LEN, "%s", src);
+               return -EINVAL;
+       }
+
+       /* copy is bounded by VHD_MAX_NAME_LEN, so this cannot overflow dst */
+       strcpy(dst, c + 1);
+       return 0;
+}
+
+/*
+ * Determine the parent of a VHD that lives on a volume and store it in
+ * image->parent.
+ *
+ * In fast mode the parent name is first decoded from the header.  If that
+ * fails, or fast mode is off, it is read through a parent locator whose
+ * data offset is shifted by the target's start offset.  The string is
+ * then reduced to the bare volume name when it has the expected format;
+ * otherwise it is left untouched.
+ *
+ * Returns 0 on success, -EINVAL if no parent locator is available, or the
+ * error reported by the failing helper.
+ */
+static int
+vhd_util_scan_get_volume_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err, decoded;
+       char lvname[VHD_MAX_NAME_LEN];
+       vhd_parent_locator_t *loc, shifted;
+
+       decoded = 0;
+
+       if (flags & VHD_SCAN_FAST)
+               decoded = !vhd_header_decode_parent(vhd, &vhd->header,
+                                                   &image->parent);
+
+       if (!decoded) {
+               loc = vhd_util_scan_get_parent_locator(vhd);
+               if (!loc)
+                       return -EINVAL;
+
+               /* locator offsets are relative to the start of the volume */
+               shifted = *loc;
+               shifted.data_offset += image->target->start;
+
+               err = vhd_parent_locator_read(vhd, &shifted, &image->parent);
+               if (err)
+                       return err;
+       }
+
+       /* an unrecognized name format is not an error: keep the string */
+       if (vhd_util_scan_extract_volume_name(lvname, image->parent))
+               return 0;
+
+       return copy_name(image->parent, lvname);
+}
+
+/*
+ * Fill in image->parent for @image.
+ *
+ * Non-VHD targets have no parent.  Volume targets are delegated to
+ * vhd_util_scan_get_volume_parent().  For files, the parent is taken from
+ * the header (fast mode) or from vhd_parent_locator_get(); if that fails,
+ * the raw parent locator string is read as a last resort.
+ *
+ * Returns 0 on success, -EINVAL if no parent locator is available, or the
+ * error reported by the failing helper.
+ */
+static int
+vhd_util_scan_get_parent(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err;
+       vhd_parent_locator_t *loc;
+
+       if (!target_vhd(image->target->type)) {
+               image->parent = NULL;
+               return 0;
+       }
+
+       if (target_volume(image->target->type))
+               return vhd_util_scan_get_volume_parent(vhd, image);
+
+       if (flags & VHD_SCAN_FAST)
+               err = vhd_header_decode_parent(vhd,
+                                              &vhd->header, &image->parent);
+       else
+               /*
+                * vhd_parent_locator_get checks for the existence of the
+                * parent file. if this call succeeds, all is well; if not,
+                * we'll try to return whatever string we have before failing
+                * outright.
+                */
+               err = vhd_parent_locator_get(vhd, &image->parent);
+
+       if (!err)
+               return 0;
+
+       loc = vhd_util_scan_get_parent_locator(vhd);
+       if (!loc)
+               return -EINVAL;
+
+       return vhd_parent_locator_read(vhd, loc, &image->parent);
+}
+
+/*
+ * Record whether @image is hidden.  VHD targets are asked through
+ * vhd_hidden(); every other target type is reported as hidden.
+ *
+ * Returns 0 on success or the error from vhd_hidden(), in which case
+ * image->hidden is left unchanged.
+ */
+static int
+vhd_util_scan_get_hidden(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err, hidden;
+
+       if (target_vhd(image->target->type)) {
+               hidden = 0;
+               err    = vhd_hidden(vhd, &hidden);
+               if (err)
+                       return err;
+       } else
+               hidden = 1;
+
+       image->hidden = hidden;
+       return 0;
+}
+
+/*
+ * Record the marker of @image.  The marker is only read for VHD targets
+ * that have a batmap; in every other case it is reported as 0.
+ *
+ * image->marker is assigned even when vhd_marker() fails.  Returns 0 or
+ * the error from vhd_marker().
+ */
+static int
+vhd_util_scan_get_marker(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err = 0;
+       char marker = 0;
+
+       if (target_vhd(image->target->type)) {
+               if (vhd_has_batmap(vhd))
+                       err = vhd_marker(vhd, &marker);
+       }
+
+       image->marker = marker;
+       return err;
+}
+
+/*
+ * Record the size and capacity of @image.  The size always comes from the
+ * target; the capacity comes from the VHD footer's current size for VHD
+ * targets and equals the size for everything else.
+ *
+ * Always returns 0.
+ */
+static int
+vhd_util_scan_get_size(vhd_context_t *vhd, struct vhd_image *image)
+{
+       image->size = image->target->size;
+
+       if (!target_vhd(image->target->type)) {
+               image->capacity = image->size;
+               return 0;
+       }
+
+       image->capacity = vhd->footer.curr_size;
+       return 0;
+}
+
+/*
+ * Open the file behind @image read-only (ignoring the "disabled" state),
+ * additionally in fast mode when VHD_SCAN_FAST is set.  Non-VHD targets
+ * are not opened at all.
+ *
+ * On failure vhd->file is cleared and image->message / image->error are
+ * filled in.  Returns 0 or the error from vhd_open().
+ */
+static int
+vhd_util_scan_open_file(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err;
+       int oflags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED;
+
+       if (!target_vhd(image->target->type))
+               return 0;
+
+       if (flags & VHD_SCAN_FAST)
+               oflags |= VHD_OPEN_FAST;
+
+       err = vhd_open(vhd, image->name, oflags);
+       if (!err)
+               return 0;
+
+       vhd->file      = NULL;
+       image->message = "opening file";
+       image->error   = err;
+
+       return image->error;
+}
+
+/*
+ * Read and validate the VHD footer (and, for dynamic disks, the header)
+ * found at offset target->start of the opened device.
+ *
+ * Footer and header are fetched with one read into a buffer aligned to
+ * VHD_SECTOR_SIZE.  If the footer's data_offset shows that the header
+ * does not directly follow the footer, the header is read separately
+ * from that offset (relative to target->start).
+ *
+ * On failure image->message and image->error are filled in.  Returns
+ * image->error.
+ */
+static int
+vhd_util_scan_read_volume_headers(vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err;
+       void *buf = NULL;
+       struct target *target = image->target;
+       size_t size = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
+
+       err = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               /* posix_memalign reports a positive errno value */
+               buf            = NULL;
+               image->message = "allocating image";
+               image->error   = -err;
+               goto out;
+       }
+
+       err = vhd_seek(vhd, target->start, SEEK_SET);
+       if (err) {
+               image->message = "seeking to headers";
+               image->error   = err;
+               goto out;
+       }
+
+       err = vhd_read(vhd, buf, size);
+       if (err) {
+               image->message = "reading headers";
+               image->error   = err;
+               goto out;
+       }
+
+       memcpy(&vhd->footer, buf, sizeof(vhd_footer_t));
+       vhd_footer_in(&vhd->footer);
+
+       err = vhd_validate_footer(&vhd->footer);
+       if (err) {
+               image->message = "invalid footer";
+               image->error   = err;
+               goto out;
+       }
+
+       /* lvhd vhds should always be dynamic */
+       if (!vhd_type_dynamic(vhd))
+               goto out;
+
+       if (vhd->footer.data_offset == sizeof(vhd_footer_t)) {
+               /* header directly follows the footer: it is in buf already */
+               memcpy(&vhd->header,
+                      (char *)buf + sizeof(vhd_footer_t),
+                      sizeof(vhd_header_t));
+               vhd_header_in(&vhd->header);
+               err = vhd_validate_header(&vhd->header);
+       } else
+               err = vhd_read_header_at(vhd, &vhd->header,
+                                        vhd->footer.data_offset +
+                                        target->start);
+
+       if (err) {
+               image->message = "reading header";
+               image->error   = err;
+               goto out;
+       }
+
+       vhd->spb     = vhd->header.block_size >> VHD_SECTOR_SHIFT;
+       vhd->bm_secs = secs_round_up_no_zero(vhd->spb >> 3);
+
+out:
+       free(buf);
+       return image->error;
+}
+
+/*
+ * Set up @vhd by hand for an image that lives on a volume: the context is
+ * zeroed, the underlying device is opened read-only with O_DIRECT and,
+ * for VHD targets, the footer/header are read from the volume.
+ *
+ * Targets spanning fewer than 4096 units (end - start) are rejected.  On
+ * failure image->message and image->error are filled in.  Returns 0 or
+ * a negative error value.
+ */
+static int
+vhd_util_scan_open_volume(vhd_context_t *vhd, struct vhd_image *image)
+{
+       struct target *target = image->target;
+
+       memset(vhd, 0, sizeof(*vhd));
+       vhd->oflags = VHD_OPEN_RDONLY | VHD_OPEN_FAST;
+
+       if (target->end - target->start < 4096) {
+               image->message = "device too small";
+               image->error   = -EINVAL;
+               return image->error;
+       }
+
+       vhd->file = strdup(image->name);
+       if (!vhd->file) {
+               image->message = "allocating device";
+               image->error   = -ENOMEM;
+               return image->error;
+       }
+
+       vhd->fd = open(target->device, O_RDONLY | O_DIRECT | O_LARGEFILE);
+       if (vhd->fd == -1) {
+               /* drop the name again so the context reads as "not open" */
+               free(vhd->file);
+               vhd->file = NULL;
+
+               image->message = "opening device";
+               image->error   = -errno;
+               return image->error;
+       }
+
+       if (!target_vhd(target->type))
+               return 0;
+
+       return vhd_util_scan_read_volume_headers(vhd, image);
+}
+
+/*
+ * Choose the name to report for @image and open it.
+ *
+ * Volume targets, and all targets outside pretty mode, are reported under
+ * the target's own name.  In pretty mode, file targets are reported under
+ * a heap-allocated copy of their resolved path; if that cannot be
+ * obtained the target name is used and the error is recorded.
+ *
+ * Returns 0 or the error recorded in image->error / returned by the
+ * volume or file open helper.
+ */
+static int
+vhd_util_scan_open(vhd_context_t *vhd, struct vhd_image *image)
+{
+       struct target *target = image->target;
+
+       if (!target_volume(image->target->type) && (flags & VHD_SCAN_PRETTY)) {
+               char resolved[PATH_MAX];
+               char *dup = NULL;
+
+               if (realpath(target->name, resolved))
+                       dup = strdup(resolved);
+
+               if (!dup) {
+                       image->name    = target->name;
+                       image->message = "resolving name";
+                       image->error   = -errno;
+                       return image->error;
+               }
+
+               image->name = dup;
+       } else
+               image->name = target->name;
+
+       if (target_volume(target->type))
+               return vhd_util_scan_open_volume(vhd, image);
+
+       return vhd_util_scan_open_file(vhd, image);
+}
+
+/*
+ * Describe the regular file @file as a scan target of the given @type.
+ * The file is used as both the target name and the device, and it spans
+ * bytes [0, st_size).
+ *
+ * Returns 0 on success, -errno if stat() fails, or -ENAMETOOLONG if the
+ * path does not fit.
+ */
+static int
+vhd_util_scan_init_file_target(struct target *target,
+                              const char *file, uint8_t type)
+{
+       int err;
+       struct stat st;
+
+       if (stat(file, &st) == -1)
+               return -errno;
+
+       err = copy_name(target->name, file);
+       if (!err)
+               err = copy_name(target->device, file);
+       if (err)
+               return err;
+
+       target->type  = type;
+       target->start = 0;
+       target->size  = st.st_size;
+       target->end   = st.st_size;
+
+       return 0;
+}
+
+/*
+ * Describe the logical volume @lv as a scan target of the given @type.
+ * Only volumes whose first segment is linear are supported; the target
+ * covers that segment on its backing device.
+ *
+ * Returns 0 on success, -ENOSYS for non-linear segments, or the error
+ * from copy_name() if a name does not fit.
+ */
+static int
+vhd_util_scan_init_volume_target(struct target *target,
+                                struct lv *lv, uint8_t type)
+{
+       int err;
+
+       if (lv->first_segment.type != LVM_SEG_TYPE_LINEAR)
+               return -ENOSYS;
+
+       err = copy_name(target->name, lv->name);
+       if (err) {
+               EPRINTF("copy target name failed: '%s'\n", lv->name);
+               return err;
+       }
+
+       err = copy_name(target->device, lv->first_segment.device);
+       if (err) {
+               EPRINTF("copy target device failed: '%s'\n",
+                               lv->first_segment.device);
+               return err;
+       }
+
+       target->type  = type;
+       target->size  = lv->size;
+       target->start = lv->first_segment.pe_start;
+       target->end   = lv->first_segment.pe_start + lv->first_segment.pe_size;
+
+       return 0;
+}
+
+/*
+ * Initialise @itr with a private copy of the @cnt entries in @targets.
+ * The iterator starts at the first entry and is exactly full.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int
+iterator_init(struct iterator *itr, int cnt, struct target *targets)
+{
+       size_t bytes = sizeof(struct target) * cnt;
+
+       memset(itr, 0, sizeof(*itr));
+
+       itr->targets = malloc(bytes);
+       if (!itr->targets)
+               return -ENOMEM;
+
+       memcpy(itr->targets, targets, bytes);
+
+       itr->cur      = 0;
+       itr->cur_size = cnt;
+       itr->max_size = cnt;
+
+       return 0;
+}
+
+/*
+ * Return the next unvisited target and advance the iterator, or NULL
+ * once every target added so far has been visited.  The returned pointer
+ * refers into the iterator's own array.
+ */
+static struct target *
+iterator_next(struct iterator *itr)
+{
+       struct target *next;
+
+       if (itr->cur == itr->cur_size)
+               return NULL;
+
+       next = itr->targets + itr->cur;
+       itr->cur++;
+
+       return next;
+}
+
+/*
+ * Initialise @target for the file @parent unless a target with the same
+ * base name is already known to the iterator.
+ *
+ * Returns -EEXIST for a duplicate, otherwise the result of
+ * vhd_util_scan_init_file_target().
+ */
+static int
+iterator_add_file(struct iterator *itr,
+                 struct target *target, const char *parent, uint8_t type)
+{
+       int idx;
+       char *known, *wanted;
+
+       for (idx = 0; idx < itr->cur_size; idx++) {
+               /* both names are re-derived on every round, as before */
+               known  = basename((char *)itr->targets[idx].name);
+               wanted = basename((char *)parent);
+
+               if (strcmp(known, wanted) == 0)
+                       return -EEXIST;
+       }
+
+       return vhd_util_scan_init_file_target(target, parent, type);
+}
+
+/*
+ * Initialise @target for the logical volume named by @parent unless a
+ * target of that name is already known to the iterator.
+ *
+ * @parent is matched against the names of the volumes in the scanned
+ * volume group with fnmatch(); the first volume for which fnmatch() does
+ * not report FNM_NOMATCH is used.
+ *
+ * Returns -EEXIST for a duplicate, -ENOENT if no volume matches (or the
+ * group is empty), the fnmatch() error code if matching itself failed,
+ * and otherwise the result of vhd_util_scan_init_volume_target().
+ */
+static int
+iterator_add_volume(struct iterator *itr,
+                   struct target *target, const char *parent, uint8_t type)
+{
+       int i, err;
+       struct lv *lv;
+
+       lv  = NULL;
+       err = -ENOENT;
+
+       for (i = 0; i < itr->cur_size; i++)
+               if (!strcmp(parent, itr->targets[i].name))
+                       return -EEXIST;
+
+       for (i = 0; i < vg.lv_cnt; i++) {
+               err = fnmatch(parent, vg.lvs[i].name, FNM_PATHNAME);
+               if (err != FNM_NOMATCH) {
+                       lv = vg.lvs + i;
+                       break;
+               }
+       }
+
+       /*
+        * "No match" is handled below as -ENOENT.  This must test the
+        * FNM_NOMATCH return code; the FNM_PATHNAME flag previously used
+        * here only worked where both constants share the same value.
+        */
+       if (err && err != FNM_NOMATCH)
+               return err;
+
+       if (!lv)
+               return -ENOENT;
+
+       return vhd_util_scan_init_volume_target(target, lv, type);
+}
+
+/*
+ * Append the target named @parent (of the given @type) to the iterator,
+ * doubling the array when it is full.
+ *
+ * A target that is already present is silently skipped.  Returns 0 on
+ * success or for a duplicate, -ENOMEM if the array cannot grow, or the
+ * error from the volume/file specific helper.
+ */
+static int
+iterator_add(struct iterator *itr, const char *parent, uint8_t type)
+{
+       int err;
+       struct target *slot;
+
+       if (itr->cur_size == itr->max_size) {
+               struct target *grown;
+
+               /* keep the old array intact if realloc() fails */
+               grown = realloc(itr->targets,
+                               sizeof(struct target) *
+                               itr->max_size * 2);
+               if (!grown)
+                       return -ENOMEM;
+
+               itr->targets   = grown;
+               itr->max_size *= 2;
+       }
+
+       slot = itr->targets + itr->cur_size;
+
+       err = target_volume(type) ?
+               iterator_add_volume(itr, slot, parent, type) :
+               iterator_add_file(itr, slot, parent, type);
+
+       if (!err) {
+               itr->cur_size++;
+               return 0;
+       }
+
+       /* wipe whatever the helper may have half-written into the slot */
+       memset(slot, 0, sizeof(*slot));
+
+       return (err == -EEXIST) ? 0 : err;
+}
+
+/*
+ * Release the iterator's target array and reset every field of @itr to
+ * zero.
+ */
+static void
+iterator_free(struct iterator *itr)
+{
+       struct target *owned = itr->targets;
+
+       free(owned);
+       memset(itr, 0, sizeof(*itr));
+}
+
+/*
+ * Queue the parent of @image for scanning.  The parent is treated as a
+ * raw or VHD image according to vhd_parent_raw(), and as a volume or a
+ * file according to the kind of target @image itself is.
+ *
+ * A failure to queue the parent is reported through
+ * vhd_util_scan_error() and otherwise ignored.
+ */
+static void
+vhd_util_scan_add_parent(struct iterator *itr,
+                        vhd_context_t *vhd, struct vhd_image *image)
+{
+       int err;
+       uint8_t type;
+
+       if (vhd_parent_raw(vhd)) {
+               if (target_volume(image->target->type))
+                       type = VHD_TYPE_RAW_VOLUME;
+               else
+                       type = VHD_TYPE_RAW_FILE;
+       } else {
+               if (target_volume(image->target->type))
+                       type = VHD_TYPE_VHD_VOLUME;
+               else
+                       type = VHD_TYPE_VHD_FILE;
+       }
+
+       err = iterator_add(itr, image->parent, type);
+       if (err)
+               vhd_util_scan_error(image->parent, err);
+}
+
+/*
+ * Scan @cnt targets: open each one, collect size, hidden flag, optional
+ * marker and (for differencing disks) parent, then print the result.
+ * With VHD_SCAN_PARENTS, every parent found is appended to the work list
+ * and scanned as well.
+ *
+ * Without VHD_SCAN_NOFAIL the scan stops at the first failing target and
+ * that target's error is returned.  With VHD_SCAN_NOFAIL all targets are
+ * processed and the result is -EAGAIN if any of them failed, 0 otherwise.
+ * Returns the iterator_init() error if the work list cannot be set up.
+ */
+static int
+vhd_util_scan_targets(int cnt, struct target *targets)
+{
+       int ret, err, owns_name;
+       vhd_context_t vhd;
+       struct iterator itr;
+       struct target *target;
+       struct vhd_image image;
+
+       ret = 0;
+
+       err = iterator_init(&itr, cnt, targets);
+       if (err)
+               return err;
+
+       while ((target = iterator_next(&itr))) {
+               memset(&vhd, 0, sizeof(vhd));
+               memset(&image, 0, sizeof(image));
+
+               image.target = target;
+
+               err = vhd_util_scan_open(&vhd, &image);
+               if (err) {
+                       ret = -EAGAIN;
+                       goto end;
+               }
+
+               err = vhd_util_scan_get_size(&vhd, &image);
+               if (err) {
+                       ret           = -EAGAIN;
+                       image.message = "getting physical size";
+                       image.error   = err;
+                       goto end;
+               }
+
+               err = vhd_util_scan_get_hidden(&vhd, &image);
+               if (err) {
+                       ret           = -EAGAIN;
+                       image.message = "checking 'hidden' field";
+                       image.error   = err;
+                       goto end;
+               }
+
+               if (flags & VHD_SCAN_MARKERS) {
+                       err = vhd_util_scan_get_marker(&vhd, &image);
+                       if (err) {
+                               ret           = -EAGAIN;
+                               image.message = "checking marker";
+                               image.error   = err;
+                               goto end;
+                       }
+               }
+
+               if (vhd.footer.type == HD_TYPE_DIFF) {
+                       err = vhd_util_scan_get_parent(&vhd, &image);
+                       if (err) {
+                               ret           = -EAGAIN;
+                               image.message = "getting parent";
+                               image.error   = err;
+                               goto end;
+                       }
+               }
+
+       end:
+               vhd_util_scan_print_image(&image);
+
+               /*
+                * Decide now whether image.name is a private copy: queuing
+                * the parent below may realloc() the iterator's array, after
+                * which @target must no longer be looked at.
+                */
+               owns_name = (image.name != target->name);
+
+               if (flags & VHD_SCAN_PARENTS && image.parent)
+                       vhd_util_scan_add_parent(&itr, &vhd, &image);
+
+               if (vhd.file)
+                       vhd_close(&vhd);
+               if (owns_name)
+                       free(image.name);
+               free(image.parent);
+
+               if (err && !(flags & VHD_SCAN_NOFAIL))
+                       break;
+       }
+
+       iterator_free(&itr);
+
+       if (flags & VHD_SCAN_NOFAIL)
+               return ret;
+
+       return err;
+}
+
+/*
+ * Scan @cnt targets in pretty mode: results are collected in a list
+ * during the scan, printed as trees afterwards, and the list is freed.
+ *
+ * Returns -ENOMEM if the list cannot be allocated.  Otherwise returns 0
+ * with VHD_SCAN_NOFAIL, or the result of vhd_util_scan_targets().
+ */
+static int
+vhd_util_scan_targets_pretty(int cnt, struct target *targets)
+{
+       int err;
+
+       if (vhd_util_scan_pretty_allocate_list(cnt)) {
+               printf("scan failed: no memory\n");
+               return -ENOMEM;
+       }
+
+       err = vhd_util_scan_targets(cnt, targets);
+
+       /* print whatever was collected, even if the scan itself failed */
+       vhd_util_scan_pretty_print_images();
+       vhd_util_scan_pretty_free_list();
+
+       if (flags & VHD_SCAN_NOFAIL)
+               return 0;
+
+       return err;
+}
+
+/*
+ * Build the list of file targets to scan: every path matching the glob
+ * pattern @filter (if given) followed by the @cnt explicit @names.
+ *
+ * On success *_targets receives a calloc()ed array owned by the caller
+ * and *_total its length; when there is nothing to scan both are left at
+ * NULL / 0 and 0 is returned.  A target that cannot be initialised is
+ * reported; without VHD_SCAN_NOFAIL this aborts with that error, with
+ * VHD_SCAN_NOFAIL its (zeroed) slot stays in the list.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_util_scan_find_file_targets(int cnt, char **names,
+                               const char *filter,
+                               struct target **_targets, int *_total)
+{
+       glob_t g;
+       struct target *targets;
+       int i, globs, err, total;
+
+       err       = 0;
+       total     = cnt;
+       globs     = 0;
+       targets   = NULL;
+       *_total   = 0;
+       *_targets = NULL;
+
+       memset(&g, 0, sizeof(g));
+
+       if (filter) {
+               int gflags = ((flags & VHD_SCAN_FAST) ? GLOB_NOSORT : 0);
+
+               errno = 0;
+               err   = glob(filter, gflags, vhd_util_scan_error, &g);
+
+               switch (err) {
+               case GLOB_NOSPACE:
+                       err = -ENOMEM;
+                       break;
+               case GLOB_ABORTED:
+                       err = -EIO;
+                       break;
+               case GLOB_NOMATCH:
+                       /* no match only counts as a failure if errno is set */
+                       err = -errno;
+                       break;
+               }
+
+               if (err) {
+                       vhd_util_scan_error(filter, err);
+                       /* leave through "out" so the glob state is freed */
+                       goto out;
+               }
+
+               globs  = g.gl_pathc;
+               total += globs;
+       }
+
+       /* nothing to scan: do not depend on what calloc(0, ...) returns */
+       if (!total)
+               goto out;
+
+       targets = calloc(total, sizeof(struct target));
+       if (!targets) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < globs; i++) {
+               err = vhd_util_scan_init_file_target(targets + i,
+                                                    g.gl_pathv[i],
+                                                    VHD_TYPE_VHD_FILE);
+               if (err) {
+                       vhd_util_scan_error(g.gl_pathv[i], err);
+                       if (!(flags & VHD_SCAN_NOFAIL))
+                               goto out;
+               }
+       }
+
+       for (i = 0; i + globs < total; i++) {
+               err = vhd_util_scan_init_file_target(targets + i + globs,
+                                                    names[i],
+                                                    VHD_TYPE_VHD_FILE);
+               if (err) {
+                       vhd_util_scan_error(names[i], err);
+                       if (!(flags & VHD_SCAN_NOFAIL))
+                               goto out;
+               }
+       }
+
+       err       = 0;
+       *_total   = total;
+       *_targets = targets;
+
+out:
+       if (err)
+               free(targets);
+       if (filter)
+               globfree(&g);
+
+       return err;
+}
+
+/*
+ * Exchange the entries at indices @dst and @src of the @lvs array.
+ * Swapping an entry with itself is a no-op.
+ */
+static inline void
+swap_volume(struct lv *lvs, int dst, int src)
+{
+       struct lv held;
+
+       if (dst == src)
+               return;
+
+       held     = lvs[dst];
+       lvs[dst] = lvs[src];
+       lvs[src] = held;
+}
+
+/*
+ * Move every volume in @lvs whose name matches the fnmatch() pattern
+ * @filter to the front of the array and report how many matched.
+ *
+ * Without a filter nothing is moved and *_matches is 0.  An fnmatch()
+ * failure other than "no match" is reported; unless VHD_SCAN_NOFAIL is
+ * set it is returned immediately, leaving *_matches at 0.
+ *
+ * Returns 0 on success.
+ */
+static int
+vhd_util_scan_sort_volumes(struct lv *lvs, int cnt,
+                          const char *filter, int *_matches)
+{
+       int idx, err, front;
+
+       front     = 0;
+       *_matches = 0;
+
+       if (!filter)
+               return 0;
+
+       for (idx = 0; idx < cnt; idx++) {
+               err = fnmatch(filter, lvs[idx].name, FNM_PATHNAME);
+
+               if (!err) {
+                       /* matched: append to the block of matches in front */
+                       swap_volume(lvs, front, idx);
+                       front++;
+                       continue;
+               }
+
+               if (err == FNM_NOMATCH)
+                       continue;
+
+               EPRINTF("fnmatch failed: '%s', '%s'\n", 
+                               filter, lvs[idx].name);
+               vhd_util_scan_error(lvs[idx].name, err);
+               if (!(flags & VHD_SCAN_NOFAIL))
+                       return err;
+       }
+
+       *_matches = front;
+       return 0;
+}
+
+/*
+ * Build the list of volume targets to scan from volume group @volume:
+ * every logical volume matching @filter, followed by those matching each
+ * of the @cnt patterns in @names.  Matching volumes are gathered at the
+ * front of the scanned group's volume array as a side effect.
+ *
+ * On success *_targets receives a calloc()ed array owned by the caller
+ * and *_total its length (NULL / 0 when nothing matched), and the scanned
+ * volume group stays loaded for the caller to release.  On failure the
+ * volume group is released here, since the caller bails out without
+ * doing so.
+ *
+ * Returns 0 on success, or the error from the failing step.
+ */
+static int
+vhd_util_scan_find_volume_targets(int cnt, char **names,
+                                 const char *volume, const char *filter,
+                                 struct target **_targets, int *_total)
+{
+       struct target *targets;
+       int i, err, total, matches;
+
+       *_total   = 0;
+       *_targets = NULL;
+       targets   = NULL;
+
+       err = lvm_scan_vg(volume, &vg);
+       if (err)
+               return err;
+
+       err = vhd_util_scan_sort_volumes(vg.lvs, vg.lv_cnt,
+                                        filter, &matches);
+       if (err)
+               goto out;
+
+       total = matches;
+       for (i = 0; i < cnt; i++) {
+               /* only look at volumes not already claimed by a match */
+               err = vhd_util_scan_sort_volumes(vg.lvs + total,
+                                                vg.lv_cnt - total,
+                                                names[i], &matches);
+               if (err)
+                       goto out;
+
+               total += matches;
+       }
+
+       /* nothing to scan: do not depend on what calloc(0, ...) returns */
+       if (!total)
+               goto out;
+
+       targets = calloc(total, sizeof(struct target));
+       if (!targets) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < total; i++) {
+               err = vhd_util_scan_init_volume_target(targets + i,
+                                                      vg.lvs + i,
+                                                      VHD_TYPE_VHD_VOLUME);
+               if (err) {
+                       vhd_util_scan_error(vg.lvs[i].name, err);
+                       if (!(flags & VHD_SCAN_NOFAIL))
+                               goto out;
+               }
+       }
+
+       err       = 0;
+       *_total   = total;
+       *_targets = targets;
+
+out:
+       if (err) {
+               free(targets);
+               lvm_free_vg(&vg);
+       }
+       return err;
+}
+
+/*
+ * Collect the targets to scan, from an LVM volume group when
+ * VHD_SCAN_VOLUME is set and from the file system otherwise.  @volume is
+ * only used in the former case.
+ *
+ * Returns whatever the selected helper returns.
+ */
+static int
+vhd_util_scan_find_targets(int cnt, char **names,
+                          const char *volume, const char *filter,
+                          struct target **targets, int *total)
+{
+       if (!(flags & VHD_SCAN_VOLUME))
+               return vhd_util_scan_find_file_targets(cnt, names,
+                                                      filter, targets, total);
+
+       return vhd_util_scan_find_volume_targets(cnt, names,
+                                                volume, filter,
+                                                targets, total);
+}
+
+/*
+ * Entry point of the "scan" command.
+ *
+ * Options: -m <filter> match pattern, -f fast, -c continue on failure,
+ * -l <volume> scan an LVM volume group, -p pretty print (disables fast
+ * mode), -a also scan parents, -v verbose, -M show markers, -h help.
+ * At least a filter or one file/volume name is required.
+ *
+ * Returns 0 on success (and always after the scan itself when -c is
+ * given), -EINVAL for bad usage, or the error from target discovery or
+ * scanning.
+ */
+int
+vhd_util_scan(int argc, char **argv)
+{
+       int c, err, cnt;
+       char *filter, *volume;
+       struct target *targets;
+
+       cnt     = 0;
+       err     = 0;
+       flags   = 0;
+       filter  = NULL;
+       volume  = NULL;
+       targets = NULL;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "m:fcl:pavMh")) != -1) {
+               switch (c) {
+               case 'm':
+                       filter = optarg;
+                       break;
+               case 'f':
+                       flags |= VHD_SCAN_FAST;
+                       break;
+               case 'c':
+                       flags |= VHD_SCAN_NOFAIL;
+                       break;
+               case 'l':
+                       volume = optarg;
+                       flags |= VHD_SCAN_VOLUME;
+                       break;
+               case 'p':
+                       flags |= VHD_SCAN_PRETTY;
+                       break;
+               case 'a':
+                       flags |= VHD_SCAN_PARENTS;
+                       break;
+               case 'v':
+                       flags |= VHD_SCAN_VERBOSE;
+                       break;
+               case 'M':
+                       flags |= VHD_SCAN_MARKERS;
+                       break;
+               case 'h':
+                       goto usage;
+               default:
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (!filter && argc - optind == 0) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       /* pretty printing and fast mode are mutually exclusive */
+       if (flags & VHD_SCAN_PRETTY)
+               flags &= ~VHD_SCAN_FAST;
+
+       err = vhd_util_scan_find_targets(argc - optind, argv + optind,
+                                        volume, filter, &targets, &cnt);
+       if (err) {
+               printf("scan failed: %d\n", err);
+               return err;
+       }
+
+       /* nothing matched: still release the target list and volume group */
+       if (!cnt)
+               goto out;
+
+       if (flags & VHD_SCAN_PRETTY)
+               err = vhd_util_scan_targets_pretty(cnt, targets);
+       else
+               err = vhd_util_scan_targets(cnt, targets);
+
+out:
+       free(targets);
+       lvm_free_vg(&vg);
+
+       return ((flags & VHD_SCAN_NOFAIL) ? 0 : err);
+
+usage:
+       printf("usage: [OPTIONS] FILES\n"
+              "options: [-m match filter] [-f fast] [-c continue on failure] "
+              "[-l LVM volume] [-p pretty print] [-a scan parents] "
+              "[-v verbose] [-h help] [-M show markers]\n");
+       return err;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-set-field.c b/tools/blktap3/vhd/lib/vhd-util-set-field.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-set-field.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+
+/*
+ * Entry point of the "set" command: set the "hidden" or "marker" field
+ * of a VHD.
+ *
+ * Options: -n <name> image to modify, -f <field> "hidden" or "marker",
+ * -v <value> decimal value in the range 0..255, -h help.  All three of
+ * -n, -f and -v are required and no further arguments are accepted.
+ *
+ * Returns 0 on success, -EINVAL for bad usage (after printing the option
+ * summary), or the error from opening or updating the image.
+ */
+int
+vhd_util_set_field(int argc, char **argv)
+{
+       long value;
+       int err, c;
+       vhd_context_t vhd;
+       char *name, *field, *end;
+
+       err   = -EINVAL;
+       value = 0;
+       name  = NULL;
+       field = NULL;
+
+       if (!argc || !argv)
+               goto usage;
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:f:v:h")) != -1) {
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'f':
+                       field = optarg;
+                       break;
+               case 'v':
+                       /* reject empty, non-numeric and overflowing input */
+                       errno = 0;
+                       value = strtol(optarg, &end, 10);
+                       if (end == optarg || *end != '\0' ||
+                           errno == ERANGE) {
+                               printf("invalid value %s\n", optarg);
+                               goto usage;
+                       }
+                       err = 0;
+                       break;
+               case 'h':
+               default:
+                       goto usage;
+               }
+       }
+
+       /* err is still -EINVAL here if no -v option was given */
+       if (!name || !field || optind != argc || err)
+               goto usage;
+
+       if (strnlen(field, 25) >= 25) {
+               printf("invalid field\n");
+               goto usage;
+       }
+
+       if (strcmp(field, "hidden") && strcmp(field, "marker")) {
+               printf("invalid field %s\n", field);
+               goto usage;
+       }
+
+       if (value < 0 || value > 255) {
+               printf("invalid value %ld\n", value);
+               goto usage;
+       }
+
+       err = vhd_open(&vhd, name, VHD_OPEN_RDWR);
+       if (err) {
+               printf("error opening %s: %d\n", name, err);
+               return err;
+       }
+
+       if (!strcmp(field, "hidden")) {
+               vhd.footer.hidden = (char)value;
+               err = vhd_write_footer(&vhd, &vhd.footer);
+               if (err == -ENOSPC && vhd_type_dynamic(&vhd) && value)
+                       /*
+                        * if no space to write the primary footer, at least
+                        * write the backup footer so that it's possible to
+                        * delete the VDI
+                        */
+                       err = vhd_write_footer_at(&vhd, &vhd.footer, 0);
+       } else {
+               err = vhd_set_marker(&vhd, (char)value);
+       }
+
+       vhd_close(&vhd);
+       return err;
+
+usage:
+       printf("options: <-n name> <-f field> <-v value> [-h help]\n");
+       return -EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-snapshot.c b/tools/blktap3/vhd/lib/vhd-util-snapshot.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-snapshot.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+
+#include "libvhd.h"
+
+/*
+ * Find the image a new snapshot of @name should actually be linked to.
+ *
+ * Starting at @name, the chain is followed towards the root for as long as
+ * the current image is a differencing disk whose BAT has no allocated block.
+ * The walk stops at the first image that is not a differencing disk or that
+ * has at least one allocated block, and that image's path is returned.  If
+ * an empty differencing disk reports a raw parent, the parent's path is
+ * returned and *@parent_raw is set to 1.
+ *
+ * @name:       path of the VHD to start from
+ * @result:     on success receives a heap-allocated path that the caller
+ *              must free(); NULL on failure
+ * @parent_raw: set to 1 if the returned path is a raw parent, else 0
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_util_find_snapshot_target(const char *name, char **result, int *parent_raw)
+{
+       int i, err;
+       char *target;
+       vhd_context_t vhd;
+
+       *parent_raw = 0;
+       *result     = NULL;
+
+       target = strdup(name);
+       if (!target)
+               return -ENOMEM;
+
+       for (;;) {
+               err = vhd_open(&vhd, target, VHD_OPEN_RDONLY);
+               if (err) {
+                       /* nothing was opened, only the path needs releasing */
+                       free(target);
+                       return err;
+               }
+
+               if (vhd.footer.type != HD_TYPE_DIFF)
+                       goto out;
+
+               err = vhd_get_bat(&vhd);
+               if (err)
+                       goto out;
+
+               /* any allocated block makes this image the target */
+               for (i = 0; i < vhd.bat.entries; i++)
+                       if (vhd.bat.bat[i] != DD_BLK_UNUSED)
+                               goto out;
+
+               /*
+                * Clear the pointer before asking for the parent path so the
+                * cleanup below cannot free the old path a second time if
+                * the lookup fails without storing a result.
+                */
+               free(target);
+               target = NULL;
+
+               err = vhd_parent_locator_get(&vhd, &target);
+               if (err)
+                       goto out;
+
+               if (vhd_parent_raw(&vhd)) {
+                       *parent_raw = 1;
+                       goto out;
+               }
+
+               vhd_close(&vhd);
+       }
+
+out:
+       vhd_close(&vhd);
+       if (err)
+               free(target);
+       else
+               *result = target;
+
+       return err;
+}
+
+/*
+ * Open @name read-only and store the value reported by vhd_chain_depth()
+ * in *@depth.  Returns 0 on success or the error of the failing call.
+ */
+static int
+vhd_util_check_depth(const char *name, int *depth)
+{
+       vhd_context_t ctx;
+       int rc;
+
+       rc = vhd_open(&ctx, name, VHD_OPEN_RDONLY);
+       if (!rc) {
+               rc = vhd_chain_depth(&ctx, depth);
+               vhd_close(&ctx);
+       }
+
+       return rc;
+}
+
+/*
+ * vhd-util "snapshot" command: create a new VHD on top of a parent image.
+ *
+ * Options:
+ *   -n name    path of the snapshot to create (required)
+ *   -p parent  path of the parent image (required)
+ *   -S size    metadata preallocation size; passed on as size << 20
+ *   -l limit   fail with -ENOSPC if depth + 1 would exceed this limit
+ *              (only checked when the parent is not raw)
+ *   -m         the parent is a raw image
+ *   -e         link to the supplied parent without looking for a
+ *              non-empty ancestor
+ *   -h         print the usage text and return 0
+ *
+ * Unless -m or -e is given, vhd_util_find_snapshot_target() picks the image
+ * to link to; if that differs from the supplied parent, the new snapshot
+ * takes the supplied parent's size.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+int
+vhd_util_snapshot(int argc, char **argv)
+{
+       vhd_flag_creat_t flags;
+       int c, err, prt_raw, limit, empty_check;
+       char *name, *pname, *backing;
+       char *ppath, __ppath[PATH_MAX];
+       uint64_t size, msize;
+       vhd_context_t vhd;
+
+       name        = NULL;
+       pname       = NULL;
+       ppath       = NULL;
+       backing     = NULL;
+       size        = 0;
+       msize       = 0;
+       flags       = 0;
+       limit       = 0;
+       prt_raw     = 0;
+       empty_check = 1;
+
+       if (!argc || !argv) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       optind = 0;
+       while ((c = getopt(argc, argv, "n:p:S:l:meh")) != -1) {
+
+               switch (c) {
+               case 'n':
+                       name = optarg;
+                       break;
+               case 'p':
+                       pname = optarg;
+                       break;
+               case 'S':
+                       msize = strtoull(optarg, NULL, 10);
+                       /* must not fall through: -S is not a depth limit */
+                       break;
+               case 'l':
+                       limit = strtol(optarg, NULL, 10);
+                       break;
+               case 'm':
+                       vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+                       break;
+               case 'e':
+                       empty_check = 0;
+                       break;
+               case 'h':
+                       err = 0;
+                       goto usage;
+               default:
+                       err = -EINVAL;
+                       goto usage;
+               }
+       }
+
+       if (!name || !pname || optind != argc) {
+               err = -EINVAL;
+               goto usage;
+       }
+
+       ppath = realpath(pname, __ppath);
+       if (!ppath)
+               return -errno;
+
+       if (vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW) || !empty_check) {
+               /* link directly to the parent the user named */
+               backing = strdup(ppath);
+               if (!backing) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+       } else {
+               err = vhd_util_find_snapshot_target(ppath, &backing, &prt_raw);
+               if (err) {
+                       backing = NULL;
+                       goto out;
+               }
+
+               /*
+                * if the sizes of the parent chain are non-uniform, we need to
+                * pick the right size: that of the supplied parent
+                */
+               if (strcmp(ppath, backing)) {
+                       err = vhd_open(&vhd, ppath, VHD_OPEN_RDONLY);
+                       if (err)
+                               goto out;
+                       size = vhd.footer.curr_size;
+                       vhd_close(&vhd);
+               }
+
+               if (prt_raw)
+                       vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW);
+       }
+
+       if (limit && !vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) {
+               int depth;
+
+               err = vhd_util_check_depth(backing, &depth);
+               if (err)
+                       printf("error checking snapshot depth: %d\n", err);
+               else if (depth + 1 > limit) {
+                       err = -ENOSPC;
+                       printf("snapshot depth exceeded: "
+                              "current depth: %d, limit: %d\n", depth, limit);
+               }
+
+               if (err)
+                       goto out;
+       }
+
+       err = vhd_snapshot(name, size, backing, msize << 20, flags);
+
+out:
+       free(backing);
+
+       return err;
+
+usage:
+       printf("options: <-n name> <-p parent name> [-l snapshot depth limit]"
+              " [-m parent_is_raw] [-S size (MB) for metadata preallocation "
+              "(see vhd-util resize)] [-e link to supplied parent name even "
+              "if it's empty] [-h help]\n");
+       return err;
+}
diff --git a/tools/blktap3/vhd/vhd-index.c b/tools/blktap3/vhd/vhd-index.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/vhd-index.c
@@ -0,0 +1,1012 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <limits.h>
+
+#include "libvhd.h"
+#include "libvhd-index.h"
+
+/* Print the command summary on stdout, then call exit(-EINVAL). */
+static void
+usage(void)
+{
+       static const char help[] =
+               "usage: vhd-index <command>\n"
+               "commands:\n"
+               "\t   index: <-i index name> <-v vhd file>\n"
+               "\t summary: <-s index name> [-v vhd file [-b block]]\n";
+
+       printf("%s", help);
+       exit(-EINVAL);
+}
+
+typedef struct vhdi_name              vhdi_name_t;
+
+/*
+ * The set of path names one vhd-index invocation works with.  Every member
+ * is a heap string filled in by vhd_index_get_name() and released by
+ * vhd_index_free_name(); vhd and bat stay NULL when no VHD was given.
+ */
+struct vhdi_name {
+       /* copy of the VHD path */
+       char                         *vhd;
+       /* "<vhd>.bat" */
+       char                         *bat;
+
+       /* copy of the index name as given */
+       char                         *base;
+       /* "<base>.index" */
+       char                         *index;
+       /* "<base>.files" */
+       char                         *files;
+};
+
+/*
+ * Fill @name with the path names derived from @index and, optionally, @vhd.
+ *
+ * @index: base name of the index; "<index>.index" and "<index>.files" are
+ *         derived from it
+ * @vhd:   VHD path, or NULL; when given, "<vhd>.bat" is derived from it
+ * @name:  zeroed first, then filled with heap strings on success
+ *
+ * Returns 0 on success, -ENAMETOOLONG if a name does not fit within
+ * VHD_MAX_NAME_LEN, or -ENOMEM.  On any failure @name holds only NULL
+ * pointers, so passing it to vhd_index_free_name() afterwards is harmless.
+ */
+static int
+vhd_index_get_name(const char *index, const char *vhd, vhdi_name_t *name)
+{
+       int err, len;
+
+       memset(name, 0, sizeof(vhdi_name_t));
+
+       len = strnlen(index, VHD_MAX_NAME_LEN);
+       if (len + 5 >= VHD_MAX_NAME_LEN - 1)
+               return -ENAMETOOLONG;
+
+       if (vhd) {
+               len = strnlen(vhd, VHD_MAX_NAME_LEN);
+               if (len >= VHD_MAX_NAME_LEN - 1)
+                       return -ENAMETOOLONG;
+
+               /* asprintf leaves the pointer undefined on failure */
+               err = asprintf(&name->vhd, "%s", vhd);
+               if (err == -1) {
+                       name->vhd = NULL;
+                       goto fail;
+               }
+
+               err = asprintf(&name->bat, "%s.bat", vhd);
+               if (err == -1) {
+                       name->bat = NULL;
+                       goto fail;
+               }
+       }
+
+       err = asprintf(&name->base, "%s", index);
+       if (err == -1) {
+               name->base = NULL;
+               goto fail;
+       }
+
+       err = asprintf(&name->index, "%s.index", index);
+       if (err == -1) {
+               name->index = NULL;
+               goto fail;
+       }
+
+       err = asprintf(&name->files, "%s.files", index);
+       if (err == -1) {
+               name->files = NULL;
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       free(name->vhd);
+       free(name->bat);
+       free(name->base);
+       free(name->index);
+       free(name->files);
+
+       /*
+        * Do not hand dangling pointers back: the caller may still run
+        * vhd_index_free_name() on this structure.
+        */
+       memset(name, 0, sizeof(vhdi_name_t));
+
+       return -ENOMEM;
+}
+
+/* Release every string held by @name; the structure itself is not freed. */
+static inline void
+vhd_index_free_name(vhdi_name_t *name)
+{
+       char *owned[] = {
+               name->vhd, name->bat, name->base, name->index, name->files,
+       };
+       size_t n;
+
+       for (n = 0; n < sizeof(owned) / sizeof(owned[0]); n++)
+               free(owned[n]);
+}
+
+/*
+ * Add @file to the on-disk file table name->files and reload the in-memory
+ * copy @files from it.  The id assigned to the file is stored in *@fid.
+ * Returns 0 on success or the error of the failing call.
+ */
+static inline int
+vhd_index_add_file_table_entry(vhdi_name_t *name, const char *file,
+                              vhdi_file_table_t *files, vhdi_file_id_t *fid)
+{
+       int rc;
+
+       /* drop the in-memory copy first; it is loaded again below */
+       vhdi_file_table_free(files);
+
+       rc = vhdi_file_table_add(name->files, file, fid);
+       if (!rc)
+               rc = vhdi_file_table_load(name->files, files);
+
+       return rc;
+}
+
+/*
+ * Look up the file id of @file in @files, comparing by resolved path.  If
+ * no entry matches, the file is added to the table.  The id is stored in
+ * *@fid.  Returns 0 on success, -errno if the path cannot be resolved, or
+ * the error from adding the entry.
+ */
+static inline int
+vhd_index_get_file_id(vhdi_name_t *name, const char *file,
+                     vhdi_file_table_t *files, vhdi_file_id_t *fid)
+{
+       char resolved[PATH_MAX];
+       int idx;
+
+       if (!realpath(file, resolved))
+               return -errno;
+
+       for (idx = 0; idx < files->entries; idx++) {
+               if (strcmp(files->table[idx].path, resolved))
+                       continue;
+
+               *fid = files->table[idx].file_id;
+               return 0;
+       }
+
+       /* not known yet: register it */
+       return vhd_index_add_file_table_entry(name, file, files, fid);
+}
+
+/*
+ * Produce the index block for one VHD block in @vhdi_block.
+ *
+ * A non-zero @block is read from the index.  A zero @block yields a newly
+ * allocated table of vhd->spb entries whose offsets are all DD_BLK_UNUSED.
+ * Either way the caller releases vhdi_block->table with free().
+ *
+ * Returns 0 on success, -ENOMEM, or the error from vhdi_read_block().
+ */
+static inline int
+vhd_index_get_block(vhdi_context_t *vhdi, vhd_context_t *vhd,
+                   uint32_t block, vhdi_block_t *vhdi_block)
+{
+       int slot;
+
+       if (block != 0)
+               return vhdi_read_block(vhdi, vhdi_block, block);
+
+       vhdi_block->entries = vhd->spb;
+       vhdi_block->table   = calloc(vhd->spb, sizeof(vhdi_entry_t));
+       if (vhdi_block->table == NULL)
+               return -ENOMEM;
+
+       slot = 0;
+       while (slot < vhdi_block->entries) {
+               vhdi_block->table[slot].offset = DD_BLK_UNUSED;
+               slot++;
+       }
+
+       return 0;
+}
+
+/*
+ * Merge the sectors that @vhd holds for @block into the index block
+ * referenced by bat->table[@block].
+ *
+ * Entries that already carry a non-zero file id are left alone.  Every
+ * other sector that is set in @vhd's bitmap for this block is recorded with
+ * @vhd's file id and its offset.  If anything changed, the index block is
+ * appended when bat->table[@block] was 0 (and the new location stored
+ * there), otherwise it is rewritten in place.
+ *
+ * *@finished is set to 1 once every one of the vhd->spb entries is mapped;
+ * it is never cleared here.
+ *
+ * Returns 0 on success (including when @vhd has no data for @block) or a
+ * negative errno value.
+ */
+static int
+vhd_index_add_bat_entry(vhdi_name_t *name, vhdi_context_t *vhdi,
+                       vhdi_bat_t *bat, vhdi_file_table_t *files,
+                       vhd_context_t *vhd, uint32_t block, char *finished)
+{
+       char *map;
+       vhdi_file_id_t fid;
+       uint32_t i, count, off;
+       vhdi_block_t vhdi_block;
+       int err, update, append;
+
+       /* the cleanup path frees map, so it must never be left undefined */
+       map    = NULL;
+       fid    = 0;
+       count  = 0;
+       update = 0;
+       append = (bat->table[block] == 0);
+
+       if (vhd->bat.bat[block] == DD_BLK_UNUSED)
+               return 0;
+
+       err = vhd_index_get_block(vhdi, vhd, bat->table[block], &vhdi_block);
+       if (err)
+               return err;
+
+       err = vhd_read_bitmap(vhd, block, &map);
+       if (err)
+               goto out;
+
+       err = vhd_index_get_file_id(name, vhd->file, files, &fid);
+       if (err)
+               goto out;
+
+       for (i = 0; i < vhd->spb; i++) {
+               /* already mapped by an image processed earlier */
+               if (vhdi_block.table[i].file_id) {
+                       count++;
+                       continue;
+               }
+
+               if (!vhd_bitmap_test(vhd, map, i))
+                       continue;
+
+               err = vhd_offset(vhd, (uint64_t)block * vhd->spb + i, &off);
+               if (err)
+                       goto out;
+
+               vhdi_block.table[i].file_id = fid;
+               vhdi_block.table[i].offset  = off;
+               count++;
+               update++;
+       }
+
+       if (update) {
+               if (append) {
+                       uint32_t location;
+
+                       err = vhdi_append_block(vhdi, &vhdi_block, &location);
+                       if (err)
+                               goto out;
+
+                       bat->table[block] = location;
+               } else {
+                       err = vhdi_write_block(vhdi, &vhdi_block,
+                                              bat->table[block]);
+                       if (err)
+                               goto out;
+               }
+       }
+
+       if (count == vhd->spb)
+               *finished = 1;
+
+       err = 0;
+
+out:
+       free(vhdi_block.table);
+       free(map);
+
+       return err;
+}
+
+/*
+ * Overlay the sectors that @vhd holds for @block on top of the index block
+ * referenced by bat->table[@block] (taken from the parent's bat).
+ *
+ * Every sector set in @vhd's bitmap is recorded with @vhd's file id and its
+ * offset, replacing whatever the entry held.  If at least one sector was
+ * recorded, the result is appended to the index as a new block and
+ * bat->table[@block] is pointed at it; the block it was read from is not
+ * modified.
+ *
+ * Returns 0 on success (including when @vhd has no data for @block) or a
+ * negative errno value.
+ */
+static int
+vhd_index_clone_bat_entry(vhdi_name_t *name, vhdi_context_t *vhdi,
+                         vhdi_bat_t *bat, vhdi_file_table_t *files,
+                         vhd_context_t *vhd, uint32_t block)
+{
+       char *map;
+       int err, update;
+       uint32_t i, off;
+       vhdi_file_id_t fid;
+       vhdi_block_t vhdi_block;
+
+       /* the cleanup path frees map, so it must never be left undefined */
+       map    = NULL;
+       fid    = 0;
+       update = 0;
+
+       if (vhd->bat.bat[block] == DD_BLK_UNUSED)
+               return 0;
+
+       err = vhd_index_get_block(vhdi, vhd, bat->table[block], &vhdi_block);
+       if (err)
+               return err;
+
+       err = vhd_read_bitmap(vhd, block, &map);
+       if (err)
+               goto out;
+
+       err = vhd_index_get_file_id(name, vhd->file, files, &fid);
+       if (err)
+               goto out;
+
+       for (i = 0; i < vhd->spb; i++) {
+               if (!vhd_bitmap_test(vhd, map, i))
+                       continue;
+
+               err = vhd_offset(vhd, (uint64_t)block * vhd->spb + i, &off);
+               if (err)
+                       goto out;
+
+               vhdi_block.table[i].file_id = fid;
+               vhdi_block.table[i].offset  = off;
+               update++;
+       }
+
+       if (update) {
+               uint32_t location;
+
+               /* always a new block: the source block is left untouched */
+               err = vhdi_append_block(vhdi, &vhdi_block, &location);
+               if (err)
+                       goto out;
+
+               bat->table[block] = location;
+       }
+
+       err = 0;
+
+out:
+       free(vhdi_block.table);
+       free(map);
+
+       return err;
+}
+
+/*
+ * Bring the index block referenced by bat->table[@block] up to date with
+ * the sectors @vhd currently holds for @block.
+ *
+ * Every sector set in @vhd's bitmap whose index entry does not already
+ * carry @vhd's file id and the sector's offset is rewritten.  If at least
+ * one entry changed, the result is appended to the index as a new block and
+ * bat->table[@block] is pointed at it.
+ *
+ * Returns 0 on success (including when @vhd has no data for @block) or a
+ * negative errno value.
+ */
+static int
+vhd_index_update_bat_entry(vhdi_name_t *name, vhdi_context_t *vhdi,
+                          vhdi_bat_t *bat, vhdi_file_table_t *files,
+                          vhd_context_t *vhd, uint32_t block)
+{
+       char *map;
+       int err, update;
+       uint32_t i, off;
+       vhdi_file_id_t fid;
+       vhdi_block_t vhdi_block;
+
+       /* the cleanup path frees map, so it must never be left undefined */
+       map    = NULL;
+       fid    = 0;
+       update = 0;
+
+       if (vhd->bat.bat[block] == DD_BLK_UNUSED)
+               return 0;
+
+       err = vhd_index_get_block(vhdi, vhd, bat->table[block], &vhdi_block);
+       if (err)
+               return err;
+
+       err = vhd_read_bitmap(vhd, block, &map);
+       if (err)
+               goto out;
+
+       err = vhd_index_get_file_id(name, vhd->file, files, &fid);
+       if (err)
+               goto out;
+
+       for (i = 0; i < vhd->spb; i++) {
+               if (!vhd_bitmap_test(vhd, map, i))
+                       continue;
+
+               err = vhd_offset(vhd, (uint64_t)block * vhd->spb + i, &off);
+               if (err)
+                       goto out;
+
+               /* entry already matches this image: nothing to change */
+               if (vhdi_block.table[i].file_id == fid &&
+                   vhdi_block.table[i].offset  == off)
+                       continue;
+
+               vhdi_block.table[i].file_id = fid;
+               vhdi_block.table[i].offset  = off;
+               update++;
+       }
+
+       if (update) {
+               uint32_t location;
+
+               err = vhdi_append_block(vhdi, &vhdi_block, &location);
+               if (err)
+                       goto out;
+
+               bat->table[block] = location;
+       }
+
+       err = 0;
+
+out:
+       free(vhdi_block.table);
+       free(map);
+
+       return err;
+}
+
+/*
+ * Build the bat file name->bat for name->vhd from scratch.
+ *
+ * Starting with name->vhd and continuing through its ancestors, every image
+ * contributes the sectors it holds to the index blocks (see
+ * vhd_index_add_bat_entry()).  The walk ends when every block is fully
+ * mapped or an image that is not a differencing disk has been processed.
+ * The resulting table is then written to name->bat.
+ *
+ * @name:           paths of the vhd, bat, index and file table
+ * @vhd_blocks:     number of entries in the bat table being built
+ * @vhd_block_size: block size recorded in the bat header
+ *
+ * Returns 0 on success or a negative errno value; on failure name->bat is
+ * unlinked.
+ */
+static int
+vhd_index_add_bat(vhdi_name_t *name,
+                 uint64_t vhd_blocks, uint32_t vhd_block_size)
+{
+       int err;
+       vhdi_bat_t bat;
+       vhd_context_t vhd;
+       vhdi_context_t vhdi;
+       vhdi_file_table_t files;
+       char *vhd_file, *finished;
+       uint32_t block, remaining;
+
+       memset(&bat, 0, sizeof(vhdi_bat_t));
+       memset(&files, 0, sizeof(vhdi_file_table_t));
+
+       vhd_file           = NULL;
+       finished           = NULL;
+       bat.vhd_blocks     = vhd_blocks;
+       bat.vhd_block_size = vhd_block_size;
+
+       /*
+        * name->vhd may have been replaced by a parent path that was never
+        * length-checked, so verify everything fits before copying.
+        */
+       if (strnlen(name->vhd, sizeof(bat.vhd_path)) >=
+           sizeof(bat.vhd_path) ||
+           strnlen(name->index, sizeof(bat.index_path)) >=
+           sizeof(bat.index_path) ||
+           strnlen(name->files, sizeof(bat.file_table_path)) >=
+           sizeof(bat.file_table_path))
+               return -ENAMETOOLONG;
+
+       strcpy(bat.vhd_path, name->vhd);
+       strcpy(bat.index_path, name->index);
+       strcpy(bat.file_table_path, name->files);
+
+       err = vhdi_open(&vhdi, name->index, O_RDWR);
+       if (err)
+               return err;
+
+       err = vhdi_file_table_load(name->files, &files);
+       if (err) {
+               vhdi_close(&vhdi);
+               return err;
+       }
+
+       err = vhdi_bat_create(name->bat, name->vhd, name->index, name->files);
+       if (err)
+               goto out;
+
+       bat.table = calloc(vhd_blocks, sizeof(uint32_t));
+       if (!bat.table) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       vhd_file = strdup(name->vhd);
+       if (!vhd_file) {
+               /* err is still 0 here; report the failure explicitly */
+               err = -ENOMEM;
+               goto out;
+       }
+
+       remaining = vhd_blocks;
+       finished  = calloc(remaining, sizeof(char));
+       if (!finished) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (;;) {
+               err = vhd_open(&vhd, vhd_file, VHD_OPEN_RDONLY);
+               if (err)
+                       goto out;
+
+               err = vhd_get_bat(&vhd);
+               if (err)
+                       goto out_vhd;
+
+               /*
+                * An ancestor may have more BAT entries than the image being
+                * indexed; bat.table and finished only have vhd_blocks slots.
+                */
+               for (block = 0;
+                    block < vhd.bat.entries && block < vhd_blocks; block++) {
+                       if (finished[block])
+                               continue;
+
+                       err = vhd_index_add_bat_entry(name, &vhdi, &bat,
+                                                     &files, &vhd, block,
+                                                     &finished[block]);
+                       if (err)
+                               goto out_bat;
+
+                       if (finished[block])
+                               remaining--;
+               }
+
+               free(vhd_file);
+               vhd_file = NULL;
+
+               /* done once all blocks are mapped or the chain ends */
+               if (!remaining || vhd.footer.type != HD_TYPE_DIFF) {
+                       vhd_put_bat(&vhd);
+                       vhd_close(&vhd);
+                       break;
+               }
+
+               err = vhd_parent_locator_get(&vhd, &vhd_file);
+               if (err)
+                       goto out_bat;
+
+               /* reached on success too: release this image, then loop */
+       out_bat:
+               vhd_put_bat(&vhd);
+       out_vhd:
+               vhd_close(&vhd);
+               if (err)
+                       goto out;
+       }
+
+       err = vhdi_bat_write(name->bat, &bat);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       if (err)
+               unlink(name->bat);
+
+       vhdi_file_table_free(&files);
+       vhdi_close(&vhdi);
+       free(bat.table);
+       free(finished);
+       free(vhd_file);
+
+       return err;
+}
+
+/*
+ * Create name->bat as a copy of the bat file belonging to @parent
+ * ("<parent>.bat") and then overlay the blocks of name->vhd on top of it.
+ *
+ * Returns 0 on success or a negative errno value; -errno from access() if
+ * the parent's bat file cannot be read.  On any later failure name->bat is
+ * unlinked.
+ */
+static int
+vhd_index_clone_bat(vhdi_name_t *name, const char *parent)
+{
+       vhdi_file_table_t ftable;
+       vhdi_context_t index;
+       vhd_context_t ctx;
+       vhdi_bat_t bat;
+       char *parent_bat;
+       uint32_t blk;
+       int rc;
+
+       memset(&bat, 0, sizeof(bat));
+       memset(&ftable, 0, sizeof(ftable));
+
+       if (asprintf(&parent_bat, "%s.bat", parent) == -1)
+               return -ENOMEM;
+
+       if (access(parent_bat, R_OK) == -1) {
+               free(parent_bat);
+               return -errno;
+       }
+
+       rc = vhdi_open(&index, name->index, O_RDWR);
+       if (rc)
+               goto done;
+
+       /* start from the parent's table ... */
+       rc = vhdi_bat_load(parent_bat, &bat);
+       if (rc)
+               goto close_index;
+
+       rc = vhdi_file_table_load(name->files, &ftable);
+       if (rc)
+               goto close_index;
+
+       /* ... and store it under this image's bat name */
+       rc = vhdi_bat_create(name->bat, name->vhd, name->index, name->files);
+       if (rc)
+               goto free_ftable;
+
+       rc = vhdi_bat_write(name->bat, &bat);
+       if (rc)
+               goto free_ftable;
+
+       rc = vhd_open(&ctx, name->vhd, VHD_OPEN_RDONLY);
+       if (rc)
+               goto free_ftable;
+
+       rc = vhd_get_bat(&ctx);
+       if (rc)
+               goto close_vhd;
+
+       for (blk = 0; blk < ctx.bat.entries; blk++) {
+               rc = vhd_index_clone_bat_entry(name, &index, &bat,
+                                              &ftable, &ctx, blk);
+               if (rc)
+                       goto put_bat;
+       }
+
+       /* rc is 0 here unless the final write fails */
+       rc = vhdi_bat_write(name->bat, &bat);
+
+put_bat:
+       vhd_put_bat(&ctx);
+close_vhd:
+       vhd_close(&ctx);
+free_ftable:
+       vhdi_file_table_free(&ftable);
+close_index:
+       vhdi_close(&index);
+done:
+       if (rc)
+               unlink(name->bat);
+       free(bat.table);
+       free(parent_bat);
+       return rc;
+}
+
+/*
+ * Refresh an existing bat file name->bat against the current contents of
+ * name->vhd and write the result back.
+ *
+ * Returns 0 on success or a negative errno value; -errno from access() if
+ * name->bat cannot be read (callers test for -ENOENT to detect a missing
+ * bat file).
+ */
+static int
+vhd_index_update_bat(vhdi_name_t *name)
+{
+       vhdi_file_table_t ftable;
+       vhdi_context_t index;
+       vhd_context_t ctx;
+       vhdi_bat_t bat;
+       uint32_t blk;
+       int rc;
+
+       memset(&bat, 0, sizeof(bat));
+       memset(&ftable, 0, sizeof(ftable));
+
+       if (access(name->bat, R_OK) == -1)
+               return -errno;
+
+       rc = vhdi_open(&index, name->index, O_RDWR);
+       if (rc)
+               goto done;
+
+       rc = vhdi_bat_load(name->bat, &bat);
+       if (rc)
+               goto close_index;
+
+       rc = vhdi_file_table_load(name->files, &ftable);
+       if (rc)
+               goto close_index;
+
+       rc = vhd_open(&ctx, name->vhd, VHD_OPEN_RDONLY);
+       if (rc)
+               goto free_ftable;
+
+       rc = vhd_get_bat(&ctx);
+       if (rc)
+               goto close_vhd;
+
+       for (blk = 0; blk < ctx.bat.entries; blk++) {
+               rc = vhd_index_update_bat_entry(name, &index, &bat,
+                                               &ftable, &ctx, blk);
+               if (rc)
+                       goto put_bat;
+       }
+
+       /* rc is 0 here unless the write fails */
+       rc = vhdi_bat_write(name->bat, &bat);
+
+put_bat:
+       vhd_put_bat(&ctx);
+close_vhd:
+       vhd_close(&ctx);
+free_ftable:
+       vhdi_file_table_free(&ftable);
+close_index:
+       vhdi_close(&index);
+done:
+       free(bat.table);
+       return rc;
+}
+
+/*
+ * Create a new, empty index (name->index) and file table (name->files),
+ * using the block size found in the header of name->vhd.
+ *
+ * Returns 0 on success, -EEXIST if either file already exists, or a
+ * negative errno value.  If creation fails part-way, both files are
+ * unlinked.
+ */
+static int
+vhd_index_create(vhdi_name_t *name)
+{
+       vhd_context_t vhd;
+       uint32_t bsize;
+       int rc;
+
+       /* never overwrite an existing index or file table */
+       if (access(name->index, F_OK) == 0 || access(name->files, F_OK) == 0)
+               return -EEXIST;
+
+       rc = vhd_open(&vhd, name->vhd, VHD_OPEN_RDONLY);
+       if (rc)
+               return rc;
+
+       rc = vhd_get_header(&vhd);
+       if (rc) {
+               vhd_close(&vhd);
+               return rc;
+       }
+
+       bsize = vhd.header.block_size;
+       vhd_close(&vhd);
+
+       rc = vhdi_create(name->index, bsize);
+       if (!rc)
+               rc = vhdi_file_table_create(name->files);
+
+       if (rc) {
+               unlink(name->index);
+               unlink(name->files);
+       }
+
+       return rc;
+}
+
+/*
+ * Index the parent of name->vhd.
+ *
+ * name->vhd and name->bat are replaced so that they refer to the parent of
+ * the VHD originally named.  The index and file table are created if the
+ * index does not exist yet.  The bat file for that image is then produced
+ * by the first of these that applies:
+ *   1. update an existing bat file;
+ *   2. clone the bat file of the image's own parent, if it has one;
+ *   3. build the bat from scratch.
+ *
+ * Returns 0 on success or a negative errno value.  On failure, files that
+ * were newly created by this call are unlinked.
+ */
+static int
+vhd_index(vhdi_name_t *name)
+{
+       char *parent;
+       vhd_context_t ctx;
+       uint64_t vhd_blocks;
+       uint32_t vhd_block_size;
+       int err, new_index, new_bat;
+
+       parent    = NULL;
+       new_bat   = 0;
+       new_index = 0;
+
+       /* find vhd's parent -- we only index read-only vhds */
+       err = vhd_open(&ctx, name->vhd, VHD_OPEN_RDONLY);
+       if (err)
+               return err;
+
+       err = vhd_parent_locator_get(&ctx, &parent);
+       vhd_close(&ctx);
+
+       if (err)
+               return err;
+
+       /* update name to point to parent */
+       free(name->vhd);
+       name->vhd = parent;
+       parent = NULL;
+
+       free(name->bat);
+       err = asprintf(&name->bat, "%s.bat", name->vhd);
+       if (err == -1) {
+               name->bat = NULL;
+               return -ENOMEM;
+       }
+
+       /* create index if it doesn't already exist */
+       err = access(name->index, R_OK | W_OK);
+       if (err == -1) {
+               if (errno == ENOENT) {
+                       new_index = 1;
+                       err = vhd_index_create(name);
+               } else {
+                       /* report the real cause, not access()'s bare -1 */
+                       err = -errno;
+               }
+       }
+
+       if (err)
+               return err;
+
+       /* get basic vhd info */
+       err = vhd_open(&ctx, name->vhd, VHD_OPEN_RDONLY);
+       if (err)
+               goto out;
+
+       err = vhd_get_header(&ctx);
+       if (err) {
+               vhd_close(&ctx);
+               goto out;
+       }
+
+       vhd_blocks     = ctx.header.max_bat_size;
+       vhd_block_size = ctx.header.block_size;
+
+       /* a missing parent is not an error here; it just disables cloning */
+       if (vhd_parent_locator_get(&ctx, &parent))
+               parent = NULL;
+
+       vhd_close(&ctx);
+
+       /* update existing bat if it exists */
+       err = vhd_index_update_bat(name);
+       if (err != -ENOENT)
+               goto out;
+
+       new_bat = 1;
+
+       if (parent) {
+               /* clone parent bat if it exists */
+               err = vhd_index_clone_bat(name, parent);
+               if (err != -ENOENT)
+                       goto out;
+       }
+
+       /* create new bat from scratch */
+       err = vhd_index_add_bat(name, vhd_blocks, vhd_block_size);
+       if (err)
+               goto out;
+
+       err = 0;
+
+out:
+       if (err) {
+               if (new_bat)
+                       unlink(name->bat);
+               if (new_index) {
+                       unlink(name->index);
+                       unlink(name->files);
+               }
+       }
+       free(parent);
+       return err;
+}
+
+/*
+ * Print the index name, its block size, and one line per file table entry
+ * (file id, path, uuid and timestamp) on stdout.
+ */
+static void
+vhd_index_print_summary(vhdi_name_t *name,
+                       uint32_t block_size, vhdi_file_table_t *files)
+{
+       int i;
+       char time[26], uuid[37];
+
+       printf("VHD INDEX          : %s\n", name->index);
+       printf("--------------------\n");
+       printf("block size         : %u\n", block_size);
+       printf("files              : %d\n", files->entries);
+
+       printf("\n");
+       for (i = 0; i < files->entries; i++) {
+               uuid_unparse(files->table[i].vhd_uuid, uuid);
+               vhd_time_to_string(files->table[i].vhd_timestamp, time);
+
+               printf("        fid 0x%04x : %s, %s, %s\n",
+                      files->table[i].file_id, files->table[i].path,
+                      uuid, time);
+       }
+
+       printf("\n");
+}
+
+/*
+ * Print the header fields of @bat on stdout under the heading @name:
+ * number of blocks, block size, and the vhd, index and file table paths.
+ */
+static inline void
+vhd_index_print_bat_header(const char *name, vhdi_bat_t *bat)
+{
+       printf("VHD INDEX BAT      : %s\n", name);
+       printf("--------------------\n");
+       printf("blocks             : %"PRIu64"\n", bat->vhd_blocks);
+       printf("block size         : %u\n", bat->vhd_block_size);
+       printf("vhd path           : %s\n", bat->vhd_path);
+       printf("index path         : %s\n", bat->index_path);
+       printf("file table path    : %s\n", bat->file_table_path);
+}
+
+/*
+ * Load name->bat and print its header followed by one line per block with
+ * the block's offset in the index.  Returns 0 on success or the error from
+ * loading the bat.
+ */
+static int
+vhd_index_print_vhd_summary(vhdi_name_t *name)
+{
+       vhdi_bat_t bat;
+       uint32_t blk;
+       int rc;
+
+       rc = vhdi_bat_load(name->bat, &bat);
+       if (rc)
+               return rc;
+
+       vhd_index_print_bat_header(name->bat, &bat);
+       printf("\n");
+
+       blk = 0;
+       while (blk < bat.vhd_blocks) {
+               printf("      block 0x%04x : offset 0x%08x\n",
+                      blk, bat.table[blk]);
+               blk++;
+       }
+
+       free(bat.table);
+       return 0;
+}
+
+/*
+ * Load name->bat, print its header, and then print every entry (file id
+ * and offset) of the index block that the bat records for @block.
+ *
+ * Returns 0 on success or if the block is unallocated, -EINVAL if @block
+ * is not a valid index into the bat table, or a negative errno value.
+ */
+static int
+vhd_index_print_vhd_block_summary(vhdi_name_t *name, uint32_t block)
+{
+       int err;
+       int i;
+       uint32_t off;
+       vhdi_bat_t bat;
+       vhdi_context_t vhdi;
+       vhdi_block_t vhdi_block;
+
+       err = vhdi_bat_load(name->bat, &bat);
+       if (err)
+               return err;
+
+       vhd_index_print_bat_header(name->bat, &bat);
+
+       /* valid indices are 0 .. vhd_blocks - 1 */
+       if (block >= bat.vhd_blocks) {
+               printf("block %u past end of bat (%"PRIu64")\n",
+                      block, bat.vhd_blocks);
+               err = -EINVAL;
+               goto out;
+       }
+
+       off = bat.table[block];
+       if (off == DD_BLK_UNUSED) {
+               printf("block %u is unallocated\n", block);
+               err = 0;
+               goto out;
+       }
+
+       err = vhdi_open(&vhdi, name->index, O_RDWR);
+       if (err)
+               goto out;
+
+       err = vhdi_read_block(&vhdi, &vhdi_block, off);
+       vhdi_close(&vhdi);
+       if (err)
+               goto out;
+
+       printf("\nBLOCK 0x%08x\n", block);
+       for (i = 0; i < vhdi_block.entries; i++)
+               printf("        sec 0x%04x : fid 0x%04x, offset 0x%08x\n", i,
+                      vhdi_block.table[i].file_id,
+                      vhdi_block.table[i].offset);
+
+       free(vhdi_block.table);
+       err = 0;
+
+out:
+       free(bat.table);
+       return err;
+}
+
+/*
+ * Print a summary of the index name->index and its file table.  If a VHD
+ * was named as well, also print its bat: the whole table when @block is
+ * (uint32_t)-1, otherwise only the given block.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+vhd_index_summary(vhdi_name_t *name, uint32_t block)
+{
+       vhdi_file_table_t ftable;
+       vhdi_context_t index;
+       uint32_t bsize;
+       int rc;
+
+       /* the index is only opened to learn its block size */
+       rc = vhdi_open(&index, name->index, O_RDWR);
+       if (rc)
+               return rc;
+
+       bsize = index.vhd_block_size;
+       vhdi_close(&index);
+
+       rc = vhdi_file_table_load(name->files, &ftable);
+       if (rc)
+               return rc;
+
+       vhd_index_print_summary(name, bsize, &ftable);
+
+       if (name->vhd)
+               rc = (block == (uint32_t)-1)
+                       ? vhd_index_print_vhd_summary(name)
+                       : vhd_index_print_vhd_block_summary(name, block);
+
+       vhdi_file_table_free(&ftable);
+       return rc;
+}
+
+/*
+ * vhd-index entry point.
+ *
+ *   -i index  update mode: index the parent of the VHD given with -v
+ *   -s index  summary mode: print the index, optionally narrowed by -v/-b
+ *   -v vhd    VHD file (required with -i, optional with -s)
+ *   -b block  block number to summarise (only with -s and -v)
+ *   -h        print usage
+ *
+ * Exactly one of -i and -s must be given.  The exit status is 0 on success
+ * and the positive errno value otherwise; usage errors exit via usage().
+ */
+int
+main(int argc, char *argv[])
+{
+       int err;
+       uint32_t block;
+       vhdi_name_t name;
+       char *vhd, *index;
+       int c, update, summary;
+
+       vhd     = NULL;
+       index   = NULL;
+       block   = (uint32_t)-1;
+       update  = 0;
+       summary = 0;
+
+       while ((c = getopt(argc, argv, "i:v:s:b:h")) != -1) {
+               switch (c) {
+               case 'i':
+                       index   = optarg;
+                       update  = 1;
+                       break;
+
+               case 'v':
+                       vhd     = optarg;
+                       break;
+
+               case 's':
+                       index   = optarg;
+                       summary = 1;
+                       break;
+
+               case 'b':
+                       block   = strtoul(optarg, NULL, 10);
+                       break;
+
+               case 'h':
+               default:
+                       usage();
+               }
+       }
+
+       if (optind != argc)
+               usage();
+
+       if (!(update ^ summary))
+               usage();
+
+       if (block != (uint32_t)-1 && (!summary || !vhd))
+               usage();
+
+       /* validate before allocating anything: usage() does not return */
+       if (update && !vhd)
+               usage();
+
+       /*
+        * On failure vhd_index_get_name() has already released whatever it
+        * allocated, so there is nothing to free here.
+        */
+       err = vhd_index_get_name(index, vhd, &name);
+       if (err)
+               return -err;
+
+       if (summary)
+               err = vhd_index_summary(&name, block);
+       else
+               err = vhd_index(&name);
+
+       vhd_index_free_name(&name);
+       return -err;
+}
diff --git a/tools/blktap3/vhd/vhd-update.c b/tools/blktap3/vhd/vhd-update.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/vhd-update.c
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Before updating a VHD file, we create a journal consisting of:
+ *   - all data at the beginning of the file, up to and including the BAT
+ *   - each allocated bitmap (existing at the same offset in the journal as
+ *                            its corresponding bitmap in the original file)
+ * Updates are performed in place by writing appropriately 
+ * transformed versions of journaled bitmaps to the original file.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <endian.h>
+#include <byteswap.h>
+
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * Print the vhd-update synopsis on stdout and terminate the process
+ * with EINVAL as its exit status.
+ */
+static void
+usage(void)
+{
+       fputs("usage: vhd-update <-n name> [-j existing journal] [-h]\n",
+             stdout);
+       exit(EINVAL);
+}
+
+/*
+ * Set the creator version in the journalled VHD's footer to 1.1, which
+ * reflects the new bitmap ordering, and write the footer back.
+ *
+ * Returns whatever vhd_write_footer() returns.
+ */
+static inline int
+update_creator_version(vhd_journal_t *journal)
+{
+       int ret;
+
+       journal->vhd.footer.crtr_ver = VHD_VERSION(1, 1);
+       ret = vhd_write_footer(&journal->vhd, &journal->vhd.footer);
+
+       return ret;
+}
+
+/*
+ * Add every block covered by the BAT to the journal, tagged
+ * VHD_JOURNAL_METADATA.
+ *
+ * Blocks 0 .. bat.entries - 1 are visited in order; the walk stops at the
+ * first failure and returns that error.  Returns 0 if every call succeeded.
+ */
+static int
+journal_bitmaps(vhd_journal_t *journal)
+{
+       unsigned int blk = 0;
+       int ret = 0;
+
+       while (!ret && blk < journal->vhd.bat.entries) {
+               ret = vhd_journal_add_block(journal, blk,
+                                           VHD_JOURNAL_METADATA);
+               blk++;
+       }
+
+       return ret;
+}
+
+/*
+ * Test bit nr of an old-format VHD bitmap.
+ *
+ * Older VHD bitmaps were little endian and bits within a word were set
+ * from right to left: the bitmap is read as an array of unsigned long,
+ * and bit nr is looked up in word nr / BITS at position nr % BITS counted
+ * from the least significant bit (BITS = bits per unsigned long).
+ *
+ * Returns 1 if the bit is set, 0 otherwise.
+ */
+static inline int
+old_test_bit(int nr, volatile void * addr)
+{
+        const size_t word_bits = sizeof(unsigned long) * 8;
+        unsigned long word = ((unsigned long *)addr)[nr / word_bits];
+
+        return (word >> (nr % word_bits)) & 1;
+}
+
+/*
+ * Set bit nr in a new-format VHD bitmap.
+ *
+ * New VHD bitmaps are big endian and bits within a word are set from
+ * left to right: bit nr lives in byte nr / 8, counted from that byte's
+ * most significant bit (BIT_MASK).
+ */
+#define BIT_MASK 0x80
+static inline void
+new_set_bit (int nr, volatile char *addr)
+{
+        volatile char *byte = addr + (nr >> 3);
+
+        *byte |= (BIT_MASK >> (nr & 7));
+}
+
+/*
+ * Translate an old-format bitmap into the new format.
+ *
+ * The first bytes bytes of out are zeroed; then, for each of the
+ * bytes * 8 bit numbers, a bit that old_test_bit() reports as set in in
+ * is set at the same bit number in out through new_set_bit().
+ */
+static void
+convert_bitmap(char *in, char *out, int bytes)
+{
+       int bit;
+       const int nbits = bytes << 3;
+
+       memset(out, 0, bytes);
+
+       for (bit = 0; bit < nbits; bit++) {
+               if (!old_test_bit(bit, (void *)in))
+                       continue;
+               new_set_bit(bit, out);
+       }
+}
+
+/*
+ * Rewrite the bitmap of every allocated block of the journalled VHD.
+ *
+ * For each BAT entry that is not DD_BLK_UNUSED the block's bitmap is read,
+ * then either copied verbatim (rollback != 0) or run through
+ * convert_bitmap() (rollback == 0), and written back in place.
+ *
+ * Returns 0 on success, otherwise the error of the first call that failed.
+ * An allocation failure is reported as a negative errno value.
+ */
+static int
+update_vhd(vhd_journal_t *journal, int rollback)
+{
+       unsigned int i;
+       int err;
+       size_t size;
+       char *buf;
+       void *converted;
+
+       buf       = NULL;
+       converted = NULL;
+
+       size = vhd_bytes_padded(journal->vhd.spb / 8);
+       err  = posix_memalign(&converted, 512, size);
+       if (err) {
+               /* posix_memalign() returns a positive error number rather
+                * than setting errno; negate it so the sign matches the
+                * negative codes used elsewhere in this tool (-errno) */
+               err       = -err;
+               converted = NULL;
+               goto out;
+       }
+
+       for (i = 0; i < journal->vhd.bat.entries; i++) {
+               /* unallocated blocks have no bitmap to convert */
+               if (journal->vhd.bat.bat[i] == DD_BLK_UNUSED)
+                       continue;
+
+               err = vhd_read_bitmap(&journal->vhd, i, &buf);
+               if (err)
+                       goto out;
+
+               if (rollback)
+                       memcpy(converted, buf, size);
+               else
+                       convert_bitmap(buf, converted, size);
+
+               /* buf is handed back by vhd_read_bitmap() on every pass;
+                * release it and drop the stale pointer */
+               free(buf);
+               buf = NULL;
+
+               err = vhd_write_bitmap(&journal->vhd, i, converted);
+               if (err)
+                       goto out;
+       }
+
+       err = 0;
+out:
+       free(converted);
+       return err;
+}
+
+/*
+ * Create the journal for file through vhd_journal_create(), passing jfile
+ * along as given.  A failure is reported on stdout and its error code is
+ * returned; 0 means the journal was created.
+ */
+static int
+open_journal(vhd_journal_t *journal, const char *file, const char *jfile)
+{
+       int ret = vhd_journal_create(journal, file, jfile);
+
+       if (ret)
+               printf("error creating journal for %s: %d\n", file, ret);
+
+       return ret;
+}
+
+/*
+ * Finish a journalled update.
+ *
+ * A non-zero err reverts the journal, zero commits it.  If that step
+ * fails the journal is only closed; if it succeeds the journal is removed.
+ * The result of that final close/remove call is what gets returned.
+ */
+static int
+close_journal(vhd_journal_t *journal, int err)
+{
+       int failed;
+
+       failed = err ? vhd_journal_revert(journal)
+                    : vhd_journal_commit(journal);
+
+       if (!failed)
+               return vhd_journal_remove(journal);
+
+       return vhd_journal_close(journal);
+}
+
+/*
+ * vhd-update entry point.
+ *
+ * Options:
+ *   -n <name>     VHD file to update (required)
+ *   -j <journal>  journal file; must be readable
+ *   -r            rollback (debugging aid): push journalled bitmaps to the
+ *                 original file without transforming them; requires -j
+ *   -h            (or any unknown option) prints the usage text
+ *
+ * Bitmaps are only rewritten for VHDs created by tapdisk with creator
+ * version 0.1 that are not of fixed type; for any other VHD the journal is
+ * simply committed without touching a bitmap.
+ *
+ * Returns 0 on success.  If a step fails, that first error is returned even
+ * when the subsequent journal revert succeeds.
+ */
+int
+main(int argc, char **argv)
+{
+       char *file, *jfile;
+       int c, err, cerr, rollback;
+       vhd_journal_t journal;
+
+       file     = NULL;
+       jfile    = NULL;
+       rollback = 0;
+
+       while ((c = getopt(argc, argv, "n:j:rh")) != -1) {
+               switch(c) {
+               case 'n':
+                       file = optarg;
+                       break;
+               case 'j':
+                       jfile = optarg;
+                       err = access(jfile, R_OK);
+                       if (err == -1) {
+                               printf("invalid journal arg %s\n", jfile);
+                               return -errno;
+                       }
+                       break;
+               case 'r':
+                       /* add a rollback option for debugging which
+                        * pushes journalled bitmaps to original file
+                        * without transforming them */
+                       rollback = 1;
+                       break;
+               default:
+                       usage();
+               }
+       }
+
+       if (!file)
+               usage();
+
+       if (rollback && !jfile) {
+               printf("rollback requires a journal argument\n");
+               usage();
+       }
+
+       err = open_journal(&journal, file, jfile);
+       if (err)
+               return err;
+
+       /* nothing to convert unless this is a non-fixed, tapdisk-created
+        * VHD still carrying creator version 0.1 */
+       if (!vhd_creator_tapdisk(&journal.vhd) ||
+           journal.vhd.footer.crtr_ver != VHD_VERSION(0, 1) ||
+           journal.vhd.footer.type == HD_TYPE_FIXED) {
+               err = 0;
+               goto out;
+       }
+
+       err = journal_bitmaps(&journal);
+       if (err) {
+               /* no changes to vhd file yet,
+                * so close the journal and bail */
+               vhd_journal_close(&journal);
+               return err;
+       }
+
+       err = update_vhd(&journal, rollback);
+       if (err) {
+               printf("update failed: %d; saving journal\n", err);
+               goto out;
+       }
+
+       err = update_creator_version(&journal);
+       if (err) {
+               printf("failed to update creator version: %d\n", err);
+               goto out;
+       }
+
+       err = 0;
+
+out:
+       /* close_journal() only reports how the revert/commit cleanup went,
+        * which can succeed after a failed update; keep the first failure
+        * so the process does not exit 0 on an unsuccessful run */
+       cerr = close_journal(&journal, err);
+       return err ? err : cerr;
+}
diff --git a/tools/blktap3/vhd/vhd-util.c b/tools/blktap3/vhd/vhd-util.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/vhd-util.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of XenSource Inc. nor the names of its contributors
+ *       may be used to endorse or promote products derived from this software
+ *       without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libvhd.h"
+#include "vhd-util.h"
+
+/*
+ * Debug print helper.  With the "#if 1" branch active, DFPRINTF() forwards
+ * its printf-style arguments to stdout; switching the condition off turns
+ * every DFPRINTF() call into a no-op expression.
+ */
+#if 1
+#define DFPRINTF(_f, _a...) fprintf(stdout, _f , ##_a)
+#else
+#define DFPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* Signature shared by all sub-command handlers: takes (argc, argv), returns int. */
+typedef int (*vhd_util_func_t) (int, char **);
+
+/* One vhd-util sub-command: the name typed on the command line and its handler. */
+struct command {
+       char               *name;
+       vhd_util_func_t     func;
+};
+
+/* Table of known sub-commands; searched by name in get_command(). */
+struct command commands[] = {
+       { .name = "create",      .func = vhd_util_create        },
+       { .name = "snapshot",    .func = vhd_util_snapshot      },
+       { .name = "query",       .func = vhd_util_query         },
+       { .name = "read",        .func = vhd_util_read          },
+       { .name = "set",         .func = vhd_util_set_field     },
+       { .name = "repair",      .func = vhd_util_repair        },
+       { .name = "resize",      .func = vhd_util_resize        },
+       { .name = "fill",        .func = vhd_util_fill          },
+       { .name = "coalesce",    .func = vhd_util_coalesce      },
+       { .name = "modify",      .func = vhd_util_modify        },
+       { .name = "scan",        .func = vhd_util_scan          },
+       { .name = "check",       .func = vhd_util_check         },
+       { .name = "revert",      .func = vhd_util_revert        },
+};
+
+/*
+ * Print the names of all entries of commands[] on stdout, formatted as
+ * "COMMAND := { first | second | ... }" followed by a newline.
+ */
+#define print_commands()                                       \
+       do {                                                    \
+               int idx;                                        \
+               const int count =                               \
+                       sizeof(commands) / sizeof(commands[0]); \
+               printf("COMMAND := { ");                        \
+               printf("%s", commands[0].name);                 \
+               for (idx = 1; idx < count; idx++)               \
+                       printf(" | %s", commands[idx].name);    \
+               printf(" }\n");                                 \
+       } while (0)
+
+TEST_FAIL_EXTERN_VARS;
+
+/*
+ * Print the vhd-util synopsis followed by the list of commands on stdout,
+ * then exit with status 0.
+ */
+void
+help(void)
+{
+       fputs("usage: vhd-util COMMAND [OPTIONS]\n", stdout);
+       print_commands();
+       exit(0);
+}
+
+/*
+ * Look up a sub-command by exact name.
+ *
+ * Names of 25 characters or more are rejected without searching the table.
+ * Returns the matching entry of commands[], or NULL if there is none.
+ */
+struct command *
+get_command(char *command)
+{
+       struct command *cur;
+       struct command *const end =
+               commands + sizeof(commands) / sizeof(commands[0]);
+
+       if (strnlen(command, 25) >= 25)
+               return NULL;
+
+       for (cur = commands; cur != end; cur++)
+               if (strcmp(command, cur->name) == 0)
+                       return cur;
+
+       return NULL;
+}
+
+/*
+ * vhd-util entry point: dispatch "vhd-util COMMAND [OPTIONS]" to the
+ * handler registered for COMMAND in commands[].
+ *
+ * The handler gets its own NULL-terminated argument vector: the command
+ * name first, then the remaining command-line arguments.  Every "--debug"
+ * argument is consumed here (it sets the libvhd log level to 1) and is not
+ * passed on.
+ *
+ * Exit status: the absolute value of the handler's return value; EINVAL
+ * for an unknown COMMAND; ENOMEM if the argument vector cannot be
+ * allocated.  With no COMMAND at all, help() prints the usage text.
+ */
+int
+main(int argc, char *argv[])
+{
+       char **cargv;
+       struct command *cmd;
+       int cargc, i, cnt, ret;
+
+#ifdef CORE_DUMP
+       /* NOTE(review): including a system header inside a function body
+        * is fragile; consider moving it to the top of the file. */
+       #include <sys/resource.h>
+       struct rlimit rlim;
+       rlim.rlim_cur = RLIM_INFINITY;
+       rlim.rlim_max = RLIM_INFINITY;
+       if (setrlimit(RLIMIT_CORE, &rlim) < 0)
+               fprintf(stderr, "setrlimit failed: %d\n", errno);
+#endif
+
+       ret = 0;
+
+       if (argc < 2)
+               help();
+
+       cargc = argc - 1;
+       cmd   = get_command(argv[1]);
+       if (!cmd) {
+               /* an unknown command is an error: print the same text as
+                * help(), but do not exit with status 0 the way it does */
+               fprintf(stderr, "invalid COMMAND %s\n", argv[1]);
+               printf("usage: vhd-util COMMAND [OPTIONS]\n");
+               print_commands();
+               exit(EINVAL);
+       }
+
+       /* one extra slot so the vector can be NULL-terminated like a
+        * regular argv */
+       cargv = malloc(sizeof(*cargv) * (cargc + 1));
+       if (!cargv)
+               exit(ENOMEM);
+
+       cnt      = 1;
+       cargv[0] = cmd->name;
+       for (i = 1; i < cargc; i++) {
+               /* argv[0] is the program, argv[1] the command name */
+               char *arg = argv[i + 1];
+
+               if (!strcmp(arg, "--debug")) {
+                       libvhd_set_log_level(1);
+                       continue;
+               }
+
+               cargv[cnt++] = arg;
+       }
+       cargv[cnt] = NULL;
+
+#ifdef ENABLE_FAILURE_TESTING
+       for (i = 0; i < NUM_FAIL_TESTS; i++) {
+               TEST_FAIL[i] = 0;
+               if (getenv(ENV_VAR_FAIL[i]))
+                       TEST_FAIL[i] = 1;
+       }
+#endif // ENABLE_FAILURE_TESTING
+
+       ret = cmd->func(cnt, cargv);
+
+       free(cargv);
+
+       /* handlers may return negative error codes; exit with the magnitude */
+       return (ret >= 0 ? ret : -ret);
+}

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.