[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH 4 of 8 RESEND] blktap3/vhd: Introduce core VHD library



This patch copies the core of the VHD functionality from blktap2, with most
changes coming from blktap2.5.

Signed-off-by: Thanos Makatos <thanos.makatos@xxxxxxxxxx>

diff --git a/tools/blktap2/vhd/lib/libvhd.c b/tools/blktap3/vhd/lib/libvhd.c
copy from tools/blktap2/vhd/lib/libvhd.c
copy to tools/blktap3/vhd/lib/libvhd.c
--- a/tools/blktap2/vhd/lib/libvhd.c
+++ b/tools/blktap3/vhd/lib/libvhd.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2008, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,12 +36,16 @@
 #include <string.h>
 #include <libgen.h>
 #include <iconv.h>
+#include <limits.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 
 #include "libvhd.h"
 #include "relative-path.h"
 
+#define VHD_HEADER_MAX_RETRIES 10
+
 static int libvhd_dbg = 0;
 
 void
@@ -57,7 +62,13 @@ libvhd_set_log_level(int level)
                               __func__, ##_a);                         \
        } while (0)
 
-#define BIT_MASK 0x80
+/*
+ * ASSERT(_p): when _p evaluates to false, call libvhd_set_log_level(1),
+ * log the failed expression together with its file and line, and then
+ * crash the process by storing through a null pointer.
+ *
+ * The body is wrapped in do { } while (0) so the macro expands to exactly
+ * one statement and cannot capture an `else` that follows the call site.
+ * The store goes through a volatile lvalue so that the compiler is not
+ * free to discard it as undefined behaviour.
+ */
+#define ASSERT(_p)                                                     \
+       do {                                                            \
+               if (!(_p)) {                                            \
+                       libvhd_set_log_level(1);                        \
+                       VHDLOG("%s:%d: FAILED ASSERTION: '%s'\n",       \
+                               __FILE__, __LINE__, #_p);               \
+                       *(volatile int *)0 = 0;                         \
+               }                                                       \
+       } while (0)
 
 #ifdef ENABLE_FAILURE_TESTING
 const char* ENV_VAR_FAIL[NUM_FAIL_TESTS] = {
@@ -69,26 +80,15 @@ const char* ENV_VAR_FAIL[NUM_FAIL_TESTS]
        "VHD_UTIL_TEST_FAIL_RESIZE_METADATA_MOVED",
        "VHD_UTIL_TEST_FAIL_RESIZE_END"
 };
+
 int TEST_FAIL[NUM_FAIL_TESTS];
 #endif // ENABLE_FAILURE_TESTING
 
-static inline int
-test_bit (volatile char *addr, int nr)
-{
-       return ((addr[nr >> 3] << (nr & 7)) & BIT_MASK) != 0;
-}
-
-static inline void
-set_bit (volatile char *addr, int nr)
-{
-       addr[nr >> 3] |= (BIT_MASK >> (nr & 7));
-}
-
-static inline void
-clear_bit (volatile char *addr, int nr)
-{
-       addr[nr >> 3] &= ~(BIT_MASK >> (nr & 7));
-}
+static void vhd_cache_init(vhd_context_t *);
+static int vhd_cache_enabled(vhd_context_t *);
+static int vhd_cache_load(vhd_context_t *);
+static int vhd_cache_unload(vhd_context_t *);
+static vhd_context_t *vhd_cache_get_parent(vhd_context_t *);
 
 static inline int
 old_test_bit(volatile char *addr, int nr)
@@ -251,8 +251,8 @@ vhd_validate_footer(vhd_footer_t *footer
        if (memcmp(footer->cookie, HD_COOKIE, csize) != 0 &&
            memcmp(footer->cookie, VHD_POISON_COOKIE, csize) != 0) {
                char buf[9];
-               strncpy(buf, footer->cookie, sizeof(buf));
-               buf[sizeof(buf)-1]= '\0';
+               memcpy(buf, footer->cookie, 8);
+               buf[8] = '\0';
                VHDLOG("invalid footer cookie: %s\n", buf);
                return -EINVAL;
        }
@@ -312,8 +312,8 @@ vhd_validate_header(vhd_header_t *header
 
        if (memcmp(header->cookie, DD_COOKIE, 8) != 0) {
                char buf[9];
-               strncpy(buf, header->cookie, sizeof(buf));
-               buf[sizeof(buf)-1]= '\0';
+               memcpy(buf, header->cookie, 8);
+               buf[8] = '\0';
                VHDLOG("invalid header cookie: %s\n", buf);
                return -EINVAL;
        }
@@ -323,8 +323,8 @@ vhd_validate_header(vhd_header_t *header
                return -EINVAL;
        }
 
-       if (header->data_offset != 0xFFFFFFFFFFFFFFFF) {
-               VHDLOG("invalid header data_offset 0x%016"PRIx64"\n",
+       if (header->data_offset != 0xFFFFFFFFFFFFFFFFULL) {
+               VHDLOG("invalid header data_offset 0x%016" PRIx64 "\n",
                       header->data_offset);
                return -EINVAL;
        }
@@ -355,18 +355,22 @@ vhd_validate_bat(vhd_bat_t *bat)
 }
 
 uint32_t
-vhd_checksum_batmap(vhd_batmap_t *batmap)
+vhd_checksum_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
 {
-       int i, n;
+       int i;
        char *blob;
        uint32_t checksum;
+       size_t map_size;
 
        blob     = batmap->map;
        checksum = 0;
 
-       n = vhd_sectors_to_bytes(batmap->header.batmap_size);
-
-       for (i = 0; i < n; i++) {
+       map_size =
+               vhd_sectors_to_bytes(secs_round_up_no_zero
+                                                        (ctx->footer.
+                                                         curr_size >> (VHD_BLOCK_SHIFT + 3)));
+
+       for (i = 0; i < map_size; i++) {
                if (batmap->header.batmap_version == VHD_BATMAP_VERSION(1, 1))
                        checksum += (uint32_t)blob[i];
                else
@@ -389,14 +393,14 @@ vhd_validate_batmap_header(vhd_batmap_t 
 }
 
 int
-vhd_validate_batmap(vhd_batmap_t *batmap)
+vhd_validate_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
 {
        uint32_t checksum;
 
        if (!batmap->map)
                return -EINVAL;
 
-       checksum = vhd_checksum_batmap(batmap);
+       checksum = vhd_checksum_batmap(ctx, batmap);
        if (checksum != batmap->header.checksum)
                return -EINVAL;
 
@@ -404,9 +408,9 @@ vhd_validate_batmap(vhd_batmap_t *batmap
 }
 
 int
-vhd_batmap_header_offset(vhd_context_t *ctx, off_t *_off)
+vhd_batmap_header_offset(vhd_context_t *ctx, off64_t *_off)
 {
-       off_t off;
+       off64_t off;
        size_t  bat;
 
        *_off = 0;
@@ -593,11 +597,11 @@ vhd_bitmap_clear(vhd_context_t *ctx, cha
  * byte of the file which is not vhd metadata
  */
 int
-vhd_end_of_headers(vhd_context_t *ctx, off_t *end)
+vhd_end_of_headers(vhd_context_t *ctx, off64_t *end)
 {
        int err, i, n;
        uint32_t bat_bytes;
-       off_t eom, bat_end;
+       off64_t eom, bat_end;
        vhd_parent_locator_t *loc;
 
        *end = 0;
@@ -613,7 +617,7 @@ vhd_end_of_headers(vhd_context_t *ctx, o
        eom       = MAX(eom, bat_end);
 
        if (vhd_has_batmap(ctx)) {
-               off_t hdr_end, hdr_secs, map_end, map_secs;
+               off64_t hdr_end, hdr_secs, map_end, map_secs;
 
                err = vhd_get_batmap(ctx);
                if (err)
@@ -637,7 +641,7 @@ vhd_end_of_headers(vhd_context_t *ctx, o
        n = sizeof(ctx->header.loc) / sizeof(vhd_parent_locator_t);
 
        for (i = 0; i < n; i++) {
-               off_t loc_end;
+               off64_t loc_end;
 
                loc = &ctx->header.loc[i];
                if (loc->code == PLAT_CODE_NONE)
@@ -652,10 +656,10 @@ vhd_end_of_headers(vhd_context_t *ctx, o
 }
 
 int
-vhd_end_of_data(vhd_context_t *ctx, off_t *end)
+vhd_end_of_data(vhd_context_t *ctx, off64_t *end)
 {
        int i, err;
-       off_t max;
+       off64_t max;
        uint64_t blk;
 
        if (!vhd_type_dynamic(ctx)) {
@@ -664,7 +668,7 @@ vhd_end_of_data(vhd_context_t *ctx, off_
                        return err;
 
                max = vhd_position(ctx);
-               if (max == (off_t)-1)
+               if (max == (off64_t) - 1)
                        return -errno;
 
                *end = max - sizeof(vhd_footer_t);
@@ -822,7 +826,7 @@ vhd_get_batmap(vhd_context_t *ctx)
        if (!vhd_has_batmap(ctx))
                return -EINVAL;
 
-       if (!vhd_validate_batmap(&ctx->batmap))
+       if (!vhd_validate_batmap(ctx, &ctx->batmap))
                return 0;
 
        vhd_put_batmap(ctx);
@@ -871,8 +875,8 @@ int
 vhd_read_short_footer(vhd_context_t *ctx, vhd_footer_t *footer)
 {
        int err;
-       char *buf;
-       off_t eof;
+       void *buf;
+       off64_t eof;
 
        buf = NULL;
 
@@ -881,7 +885,7 @@ vhd_read_short_footer(vhd_context_t *ctx
                goto out;
 
        eof = vhd_position(ctx);
-       if (eof == (off_t)-1) {
+       if (eof == (off64_t) - 1) {
                err = -errno;
                goto out;
        }
@@ -890,7 +894,7 @@ vhd_read_short_footer(vhd_context_t *ctx
        if (err)
                goto out;
 
-       err = posix_memalign((void **)&buf,
+       err = posix_memalign(&buf,
                             VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
        if (err) {
                buf = NULL;
@@ -919,10 +923,10 @@ out:
 }
 
 int
-vhd_read_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off_t off)
+vhd_read_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off)
 {
+       void *buf;
        int err;
-       char *buf;
 
        buf = NULL;
 
@@ -930,7 +934,7 @@ vhd_read_footer_at(vhd_context_t *ctx, v
        if (err)
                goto out;
 
-       err = posix_memalign((void **)&buf,
+       err = posix_memalign(&buf,
                             VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
        if (err) {
                buf = NULL;
@@ -959,14 +963,14 @@ int
 vhd_read_footer(vhd_context_t *ctx, vhd_footer_t *footer)
 {
        int err;
-       off_t off;
+       off64_t off;
 
        err = vhd_seek(ctx, 0, SEEK_END);
        if (err)
                return err;
 
        off = vhd_position(ctx);
-       if (off == (off_t)-1)
+       if (off == (off64_t) - 1)
                return -errno;
 
        err = vhd_read_footer_at(ctx, footer, off - 512);
@@ -977,17 +981,22 @@ vhd_read_footer(vhd_context_t *ctx, vhd_
        if (err != -EINVAL)
                return err;
 
-       if (ctx->oflags & VHD_OPEN_STRICT)
-               return -EINVAL;
+       /* 
+        * Disable the enforcement of VHD_OPEN_STRICT until we figure out how 
+        * to recover from crashes. Note that we never enforced it before 
+        * anyways due to a bug (CA-28285) and everything was ok.
+        */
+       /* if (ctx->oflags & VHD_OPEN_STRICT)
+          return -EINVAL; */
 
        return vhd_read_footer_at(ctx, footer, 0);
 }
 
 int
-vhd_read_header_at(vhd_context_t *ctx, vhd_header_t *header, off_t off)
+vhd_read_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off)
 {
+       void *buf;
        int err;
-       char *buf;
 
        buf = NULL;
 
@@ -1000,7 +1009,7 @@ vhd_read_header_at(vhd_context_t *ctx, v
        if (err)
                goto out;
 
-       err = posix_memalign((void **)&buf,
+       err = posix_memalign(&buf,
                             VHD_SECTOR_SIZE, sizeof(vhd_header_t));
        if (err) {
                buf = NULL;
@@ -1028,8 +1037,7 @@ out:
 int
 vhd_read_header(vhd_context_t *ctx, vhd_header_t *header)
 {
-       int err;
-       off_t off;
+       off64_t off;
 
        if (!vhd_type_dynamic(ctx)) {
                VHDLOG("%s is not dynamic!\n", ctx->file);
@@ -1044,8 +1052,9 @@ int
 vhd_read_bat(vhd_context_t *ctx, vhd_bat_t *bat)
 {
        int err;
-       char *buf;
-       off_t off;
+       void *buf;
+       off64_t off;
+       uint32_t vhd_blks;
        size_t size;
 
        buf  = NULL;
@@ -1056,9 +1065,14 @@ vhd_read_bat(vhd_context_t *ctx, vhd_bat
        }
 
        off  = ctx->header.table_offset;
-       size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
-
-       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       /* The BAT size is stored in ctx->header.max_bat_size. However, we
+        * sometimes preallocate BAT + batmap for max VHD size, so only read in
+        * the BAT entries that are in use for curr_size */
+       vhd_blks = ctx->footer.curr_size >> VHD_BLOCK_SHIFT;
+       ASSERT(ctx->header.max_bat_size >= vhd_blks);
+       size = vhd_bytes_padded(vhd_blks * sizeof(uint32_t));
+
+       err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
        if (err) {
                buf = NULL;
                err = -err;
@@ -1074,7 +1088,7 @@ vhd_read_bat(vhd_context_t *ctx, vhd_bat
                goto fail;
 
        bat->spb     = ctx->header.block_size >> VHD_SECTOR_SHIFT;
-       bat->entries = ctx->header.max_bat_size;
+       bat->entries = vhd_blks;
        bat->bat     = (uint32_t *)buf;
 
        vhd_bat_in(bat);
@@ -1092,8 +1106,8 @@ static int
 vhd_read_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap)
 {
        int err;
-       char *buf;
-       off_t off;
+       void *buf;
+       off64_t off;
        size_t size;
 
        buf = NULL;
@@ -1107,7 +1121,7 @@ vhd_read_batmap_header(vhd_context_t *ct
                goto fail;
 
        size = vhd_bytes_padded(sizeof(vhd_batmap_header_t));
-       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
        if (err) {
                buf = NULL;
                err = -err;
@@ -1137,13 +1151,16 @@ static int
 vhd_read_batmap_map(vhd_context_t *ctx, vhd_batmap_t *batmap)
 {
        int err;
-       char *buf;
-       off_t off;
+       void *buf;
+       off64_t off;
        size_t map_size;
 
-       map_size = vhd_sectors_to_bytes(batmap->header.batmap_size);
-
-       err = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, map_size);
+       map_size = vhd_sectors_to_bytes(secs_round_up_no_zero
+                                                        (ctx->footer.
+                                                         curr_size >> (VHD_BLOCK_SHIFT + 3)));
+       ASSERT(vhd_sectors_to_bytes(batmap->header.batmap_size) >= map_size);
+
+       err = posix_memalign(&buf, VHD_SECTOR_SIZE, map_size);
        if (err) {
                buf = NULL;
                err = -err;
@@ -1191,7 +1208,7 @@ vhd_read_batmap(vhd_context_t *ctx, vhd_
        if (err)
                return err;
 
-       err = vhd_validate_batmap(batmap);
+       err = vhd_validate_batmap(ctx, batmap);
        if (err)
                goto fail;
 
@@ -1252,8 +1269,10 @@ vhd_test_file_fixed(const char *file, in
 int
 vhd_find_parent(vhd_context_t *ctx, const char *parent, char **_location)
 {
+       char *location, __location[PATH_MAX];
+       char *cpath, __cpath[PATH_MAX];
+       char *cdir, *path;
        int err;
-       char *location, *cpath, *cdir, *path;
 
        err        = 0;
        path       = NULL;
@@ -1266,16 +1285,15 @@ vhd_find_parent(vhd_context_t *ctx, cons
 
        if (parent[0] == '/') {
                if (!access(parent, R_OK)) {
-                       path = strdup(parent);
-                       if (!path)
-                               return -ENOMEM;
-                       *_location = path;
+                       *_location = strdup(parent);
+                       if (!*_location)
+                               return -errno;
                        return 0;
                }
        }
 
        /* check parent path relative to child's directory */
-       cpath = realpath(ctx->file, NULL);
+       cpath = realpath(ctx->file, __cpath);
        if (!cpath) {
                err = -errno;
                goto out;
@@ -1289,28 +1307,27 @@ vhd_find_parent(vhd_context_t *ctx, cons
        }
 
        if (!access(location, R_OK)) {
-               path = realpath(location, NULL);
+               path = realpath(location, __location);
                if (path) {
-                       *_location = path;
-                       return 0;
+                       *_location = strdup(path);
+                       if (*_location)
+                               goto out;
                }
        }
        err = -errno;
 
 out:
        free(location);
-       free(cpath);
        return err;
 }
 
-static int 
+int
 vhd_macx_encode_location(char *name, char **out, int *outlen)
 {
        iconv_t cd;
        int len, err;
        size_t ibl, obl;
-       char *uri, *uri_utf8, *uri_utf8p, *ret;
-       const char *urip;
+       char *uri, *urip, *uri_utf8, *uri_utf8p, *ret;
 
        err     = 0;
        ret     = NULL;
@@ -1321,7 +1338,7 @@ vhd_macx_encode_location(char *name, cha
        ibl     = len;
        obl     = len;
 
-       urip = uri = malloc(ibl + 1);
+       uri = urip = malloc(ibl + 1);
        uri_utf8 = uri_utf8p = malloc(obl);
 
        if (!uri || !uri_utf8)
@@ -1333,14 +1350,10 @@ vhd_macx_encode_location(char *name, cha
                goto out;
        }
 
-       snprintf(uri, ibl+1, "file://%s", name);
-
-       if (iconv(cd,
-#ifdef __linux__
-           (char **)
-#endif
-           &urip, &ibl, &uri_utf8p, &obl) == (size_t)-1 ||
-           ibl || obl) {
+       sprintf(uri, "file://%s", name);
+
+       if (iconv(cd, &urip, &ibl, &uri_utf8p, &obl) == (size_t) - 1 ||
+               ibl || obl) {
                err = (errno ? -errno : -EIO);
                goto out;
        }
@@ -1364,14 +1377,13 @@ vhd_macx_encode_location(char *name, cha
        return err;
 }
 
-static int
+int
 vhd_w2u_encode_location(char *name, char **out, int *outlen)
 {
        iconv_t cd;
        int len, err;
        size_t ibl, obl;
-       char *uri, *uri_utf16, *uri_utf16p, *tmp, *ret;
-       const char *urip;
+       char *uri, *urip, *uri_utf16, *uri_utf16p, *tmp, *ret;
 
        err     = 0;
        ret     = NULL;
@@ -1425,12 +1437,8 @@ vhd_w2u_encode_location(char *name, char
                goto out;
        }
 
-       if (iconv(cd,
-#ifdef __linux__
-           (char **)
-#endif
-           &urip, &ibl, &uri_utf16p, &obl) == (size_t)-1 ||
-           ibl || obl) {
+       if (iconv(cd, &urip, &ibl, &uri_utf16p, &obl) == (size_t) - 1 ||
+               ibl || obl) {
                err = (errno ? -errno : -EIO);
                goto out;
        }
@@ -1457,7 +1465,7 @@ vhd_w2u_encode_location(char *name, char
 }
 
 static char *
-vhd_macx_decode_location(const char *in, char *out, int len)
+vhd_macx_decode_location(char *in, char *out, int len)
 {
        iconv_t cd;
        char *name;
@@ -1470,11 +1478,7 @@ vhd_macx_decode_location(const char *in,
        if (cd == (iconv_t)-1) 
                return NULL;
 
-       if (iconv(cd,
-#ifdef __linux__
-               (char **)
-#endif
-               &in, &ibl, &out, &obl) == (size_t)-1 || ibl)
+       if (iconv(cd, &in, &ibl, &out, &obl) == (size_t) - 1 || ibl)
                return NULL;
 
        iconv_close(cd);
@@ -1489,7 +1493,7 @@ vhd_macx_decode_location(const char *in,
 }
 
 static char *
-vhd_w2u_decode_location(const char *in, char *out, int len, char *utf_type)
+vhd_w2u_decode_location(char *in, char *out, int len, char *utf_type)
 {
        iconv_t cd;
        char *name, *tmp;
@@ -1502,11 +1506,7 @@ vhd_w2u_decode_location(const char *in, 
        if (cd == (iconv_t)-1) 
                return NULL;
 
-       if (iconv(cd,
-#ifdef __linux__
-               (char **)
-#endif
-               &in, &ibl, &out, &obl) == (size_t)-1 || ibl)
+       if (iconv(cd, &in, &ibl, &out, &obl) == (size_t) - 1 || ibl)
                return NULL;
 
        iconv_close(cd);
@@ -1545,7 +1545,7 @@ vhd_parent_locator_read(vhd_context_t *c
                        vhd_parent_locator_t *loc, char **parent)
 {
        int err, size;
-       char *raw, *out, *name;
+       void *raw, *out, *name;
 
        raw     = NULL;
        out     = NULL;
@@ -1577,7 +1577,7 @@ vhd_parent_locator_read(vhd_context_t *c
                goto out;
        }
 
-       err = posix_memalign((void **)&raw, VHD_SECTOR_SIZE, size);
+       err = posix_memalign(&raw, VHD_SECTOR_SIZE, size);
        if (err) {
                raw = NULL;
                err = -err;
@@ -1635,7 +1635,7 @@ vhd_parent_locator_get(vhd_context_t *ct
        char *name, *location;
        vhd_parent_locator_t *loc;
 
-       err     = 0;
+       err     = -EINVAL;
        *parent = NULL;
 
        if (ctx->footer.type != HD_TYPE_DIFF)
@@ -1643,9 +1643,11 @@ vhd_parent_locator_get(vhd_context_t *ct
 
        n = vhd_parent_locator_count(ctx);
        for (i = 0; i < n; i++) {
+               int _err;
+
                loc = ctx->header.loc + i;
-               err = vhd_parent_locator_read(ctx, loc, &name);
-               if (err)
+               _err = vhd_parent_locator_read(ctx, loc, &name);
+               if (_err)
                        continue;
 
                err = vhd_find_parent(ctx, name, &location);
@@ -1665,12 +1667,14 @@ vhd_parent_locator_get(vhd_context_t *ct
 
 int
 vhd_parent_locator_write_at(vhd_context_t *ctx,
-                           const char *parent, off_t off, uint32_t code,
+                           const char *parent, off64_t off, uint32_t code,
                            size_t max_bytes, vhd_parent_locator_t *loc)
 {
        struct stat stats;
        int err, len, size;
-       char *absolute_path, *relative_path, *encoded, *block;
+       char *absolute_path, *relative_path, *encoded;
+       char __parent[PATH_MAX];
+       void *block;
 
        memset(loc, 0, sizeof(vhd_parent_locator_t));
 
@@ -1693,7 +1697,7 @@ vhd_parent_locator_write_at(vhd_context_
                return -EINVAL;
        }
 
-       absolute_path = realpath(parent, NULL);
+       absolute_path = realpath(parent, __parent);
        if (!absolute_path) {
                err = -errno;
                goto out;
@@ -1742,7 +1746,7 @@ vhd_parent_locator_write_at(vhd_context_
                goto out;
        }
 
-       err  = posix_memalign((void **)&block, VHD_SECTOR_SIZE, size);
+       err  = posix_memalign(&block, VHD_SECTOR_SIZE, size);
        if (err) {
                block = NULL;
                err   = -err;
@@ -1759,7 +1763,6 @@ vhd_parent_locator_write_at(vhd_context_
        err = 0;
 
 out:
-       free(absolute_path);
        free(relative_path);
        free(encoded);
        free(block);
@@ -1781,7 +1784,7 @@ out:
 }
 
 static int
-vhd_footer_offset_at_eof(vhd_context_t *ctx, off_t *off)
+vhd_footer_offset_at_eof(vhd_context_t *ctx, off64_t *off)
 {
        int err;
        if ((err = vhd_seek(ctx, 0, SEEK_END)))
@@ -1794,9 +1797,9 @@ int
 vhd_read_bitmap(vhd_context_t *ctx, uint32_t block, char **bufp)
 {
        int err;
-       char *buf;
+       void *buf;
        size_t size;
-       off_t off;
+       off64_t off;
        uint64_t blk;
 
        buf   = NULL;
@@ -1823,7 +1826,7 @@ vhd_read_bitmap(vhd_context_t *ctx, uint
        if (err)
                return err;
 
-       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
        if (err)
                return -err;
 
@@ -1843,10 +1846,10 @@ int
 vhd_read_block(vhd_context_t *ctx, uint32_t block, char **bufp)
 {
        int err;
-       char *buf;
+       void *buf;
        size_t size;
        uint64_t blk;
-       off_t end, off;
+       off64_t end, off;
 
        buf   = NULL;
        *bufp = NULL;
@@ -1872,7 +1875,7 @@ vhd_read_block(vhd_context_t *ctx, uint3
        if (err)
                return err;
 
-       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
        if (err) {
                err = -err;
                goto fail;
@@ -1900,20 +1903,21 @@ fail:
 }
 
 int
-vhd_write_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off_t off)
+vhd_write_footer_at(vhd_context_t *ctx, vhd_footer_t *footer, off64_t off)
 {
        int err;
+       void *buf;
        vhd_footer_t *f;
 
        f = NULL;
 
-       err = posix_memalign((void **)&f,
+       err = posix_memalign(&buf,
                             VHD_SECTOR_SIZE, sizeof(vhd_footer_t));
        if (err) {
-               f   = NULL;
                err = -err;
                goto out;
        }
+       f     = buf;
 
        memcpy(f, footer, sizeof(vhd_footer_t));
        f->checksum = vhd_checksum_footer(f);
@@ -1942,7 +1946,7 @@ int
 vhd_write_footer(vhd_context_t *ctx, vhd_footer_t *footer)
 {
        int err;
-       off_t off;
+       off64_t off;
 
        if (ctx->is_block)
                err = vhd_footer_offset_at_eof(ctx, &off);
@@ -1955,6 +1959,12 @@ vhd_write_footer(vhd_context_t *ctx, vhd
        if (err)
                return err;
 
+       if (!ctx->is_block) {
+               err = ftruncate(ctx->fd, off + sizeof(vhd_footer_t));
+               if (err)
+                       return -errno;
+       }
+
        if (!vhd_type_dynamic(ctx))
                return 0;
 
@@ -1962,10 +1972,11 @@ vhd_write_footer(vhd_context_t *ctx, vhd
 }
 
 int
-vhd_write_header_at(vhd_context_t *ctx, vhd_header_t *header, off_t off)
+vhd_write_header_at(vhd_context_t *ctx, vhd_header_t *header, off64_t off)
 {
        int err;
        vhd_header_t *h;
+       void *buf;
 
        h = NULL;
 
@@ -1974,13 +1985,13 @@ vhd_write_header_at(vhd_context_t *ctx, 
                goto out;
        }
 
-       err = posix_memalign((void **)&h,
+       err = posix_memalign(&buf,
                             VHD_SECTOR_SIZE, sizeof(vhd_header_t));
        if (err) {
-               h   = NULL;
                err = -err;
                goto out;
        }
+       h     = buf;
 
        memcpy(h, header, sizeof(vhd_header_t));
 
@@ -2008,8 +2019,7 @@ out:
 int
 vhd_write_header(vhd_context_t *ctx, vhd_header_t *header)
 {
-       int err;
-       off_t off;
+       off64_t off;
 
        if (!vhd_type_dynamic(ctx))
                return -EINVAL;
@@ -2022,8 +2032,9 @@ int
 vhd_write_bat(vhd_context_t *ctx, vhd_bat_t *bat)
 {
        int err;
-       off_t off;
+       off64_t off;
        vhd_bat_t b;
+       void *buf;
        size_t size;
 
        if (!vhd_type_dynamic(ctx))
@@ -2046,9 +2057,10 @@ vhd_write_bat(vhd_context_t *ctx, vhd_ba
        if (err)
                return err;
 
-       err  = posix_memalign((void **)&b.bat, VHD_SECTOR_SIZE, size);
+       err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
        if (err)
                return -err;
+       b.bat = buf;
 
        memcpy(b.bat, bat->bat, size);
        b.spb     = bat->spb;
@@ -2061,13 +2073,50 @@ vhd_write_bat(vhd_context_t *ctx, vhd_ba
        return err;
 }
 
+/*
+ * Write the header of @batmap to the batmap header location of @ctx.
+ *
+ * The offset comes from vhd_batmap_header_offset(). The header is copied
+ * into a zero-filled, VHD_SECTOR_SIZE-aligned bounce buffer whose length
+ * is the header size rounded up by vhd_bytes_padded(), and that buffer is
+ * written with vhd_write().
+ *
+ * Returns 0 on success or a negative error code from the failing step.
+ */
+static int
+vhd_write_batmap_header(vhd_context_t *ctx, vhd_batmap_t *batmap)
+{
+       int err;
+       size_t size;
+       off64_t off;
+       vhd_batmap_t b;
+       void *buf = NULL;
+
+       err = vhd_batmap_header_offset(ctx, &off);
+       if (err)
+               goto out;
+
+       /*
+        * Size the buffer from the header alone (as the read path does),
+        * not from the whole vhd_batmap_t, so no more than the padded
+        * header is ever written at this offset.
+        */
+       size = vhd_bytes_padded(sizeof(batmap->header));
+
+       err = vhd_seek(ctx, off, SEEK_SET);
+       if (err)
+               goto out;
+
+       err = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
+       if (err) {
+               err = -err;
+               goto out;
+       }
+
+       /*
+        * vhd_batmap_header_out() converts the header in place. Convert a
+        * private copy, as vhd_write_batmap() does, so the caller's
+        * in-memory header is left untouched after the write.
+        */
+       b = *batmap;
+       vhd_batmap_header_out(&b);
+
+       memset(buf, 0, size);
+       memcpy(buf, &b.header, sizeof(b.header));
+
+       err = vhd_write(ctx, buf, size);
+
+out:
+       if (err)
+               VHDLOG("%s: failed writing batmap: %d\n", ctx->file, err);
+       free(buf);
+       return err;
+}
+
 int
 vhd_write_batmap(vhd_context_t *ctx, vhd_batmap_t *batmap)
 {
        int err;
-       off_t off;
+       off64_t off;
        vhd_batmap_t b;
-       char *buf, *map;
+       void *buf, *map;
        size_t size, map_size;
 
        buf      = NULL;
@@ -2081,19 +2130,22 @@ vhd_write_batmap(vhd_context_t *ctx, vhd
        b.header = batmap->header;
        b.map    = batmap->map;
 
-       b.header.checksum = vhd_checksum_batmap(&b);
-       err = vhd_validate_batmap(&b);
+       b.header.checksum = vhd_checksum_batmap(ctx, &b);
+       err = vhd_validate_batmap(ctx, &b);
        if (err)
                goto out;
 
        off      = b.header.batmap_offset;
-       map_size = vhd_sectors_to_bytes(b.header.batmap_size);
+       map_size = vhd_sectors_to_bytes(secs_round_up_no_zero
+                                                        (ctx->footer.
+                                                         curr_size >> (VHD_BLOCK_SHIFT + 3)));
+       ASSERT(vhd_sectors_to_bytes(b.header.batmap_size) >= map_size);
 
        err  = vhd_seek(ctx, off, SEEK_SET);
        if (err)
                goto out;
 
-       err  = posix_memalign((void **)&map, VHD_SECTOR_SIZE, map_size);
+       err  = posix_memalign(&map, VHD_SECTOR_SIZE, map_size);
        if (err) {
                map = NULL;
                err = -err;
@@ -2116,7 +2168,7 @@ vhd_write_batmap(vhd_context_t *ctx, vhd
        if (err)
                goto out;
 
-       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
        if (err) {
                err = -err;
                buf = NULL;
@@ -2141,9 +2193,9 @@ int
 vhd_write_bitmap(vhd_context_t *ctx, uint32_t block, char *bitmap)
 {
        int err;
-       off_t off;
+       off64_t off;
        uint64_t blk;
-       size_t secs, size;
+       size_t size;
 
        if (!vhd_type_dynamic(ctx))
                return -EINVAL;
@@ -2180,7 +2232,7 @@ int
 vhd_write_block(vhd_context_t *ctx, uint32_t block, char *data)
 {
        int err;
-       off_t off;
+       off64_t off;
        size_t size;
        uint64_t blk;
 
@@ -2230,13 +2282,79 @@ namedup(char **dup, const char *name)
        return 0;
 }
 
+/*
+ * write(2) and pwrite(2) take a const buffer. These casts give them the
+ * same pointer type as read(2) and pread(2) so that either direction can
+ * be passed to the helpers below. The expansions are parenthesised so
+ * they behave as a single operand wherever they are used.
+ */
+#define vwrite ((ssize_t (*)(int, void *, size_t))write)
+#define vpwrite ((ssize_t (*)(int, void *, size_t, off_t))pwrite)
+
+/*
+ * Call @f (a pread- or pwrite-shaped function) repeatedly until @n bytes
+ * starting at file offset @off have been transferred to or from @_s.
+ *
+ * Returns:
+ *   @n           when everything was transferred;
+ *   a short count when @f returned 0 (errno is set to EPIPE), or when the
+ *                transfer ended exactly at the size reported by fstat();
+ *   -1           when @f failed with anything other than EINTR or EAGAIN,
+ *                or when fstat() failed. errno is left as set by the
+ *                failing call.
+ *
+ * A hard I/O error is reported as -1 rather than 0 so that it cannot be
+ * mistaken for a successful zero-length transfer.
+ */
+static ssize_t
+vhd_atomic_pio(ssize_t(*f) (int, void *, size_t, off_t),
+                          int fd, void *_s, size_t n, off_t off)
+{
+       char *s = _s;
+       size_t pos = 0;
+       ssize_t res;
+       struct stat st;
+
+       memset(&st, 0, sizeof(st));
+
+       for (;;) {
+               res = f(fd, s + pos, n - pos, off + pos);
+
+               if (res == -1) {
+                       /* interrupted or would block: try again */
+                       if (errno == EINTR || errno == EAGAIN)
+                               continue;
+                       return -1;
+               }
+
+               if (res == 0) {
+                       /* no progress: report what has been done so far */
+                       errno = EPIPE;
+                       return pos;
+               }
+
+               if (pos + res == n)
+                       return n;
+
+               /* fstat() is repeated only while st_size is still zero */
+               if (!st.st_size)
+                       if (fstat(fd, &st) == -1)
+                               return -1;
+
+               /* a short transfer that stops at end of file is final */
+               if (off + pos + res == st.st_size)
+                       return pos + res;
+
+               /*
+                * Advance by whole sectors only, so the next request
+                * starts on a sector boundary (presumably to keep the
+                * offsets usable with O_DIRECT -- confirm).
+                */
+               pos += (res & ~(VHD_SECTOR_SIZE - 1));
+       }
+}
+
+/*
+ * read()/write()-style wrapper around vhd_atomic_pio().
+ *
+ * Takes the current file position of @fd, performs the transfer at that
+ * position through the positional variant of @f (pread when @f is read,
+ * otherwise vpwrite), and then moves the file position past the bytes
+ * that were transferred.
+ *
+ * Returns what vhd_atomic_pio() returned, or -1 with errno set when the
+ * position cannot be queried, cannot be represented in off_t, or cannot
+ * be updated afterwards.
+ */
+static ssize_t
+vhd_atomic_io(ssize_t(*f) (int, void *, size_t), int fd, void *_s,
+                         size_t n)
+{
+       off64_t off;
+       ssize_t res;
+       ssize_t(*pf) (int, void *, size_t, off_t);
+
+       off = lseek64(fd, 0, SEEK_CUR);
+       if (off == (off64_t) - 1)
+               return -1;
+
+       /*
+        * vhd_atomic_pio() takes an off_t. Where off_t is narrower than
+        * off64_t, fail instead of silently truncating the offset and
+        * touching the wrong part of the file.
+        */
+       if ((off64_t)(off_t)off != off) {
+               errno = EOVERFLOW;
+               return -1;
+       }
+
+       pf = (f == read ? pread : vpwrite);
+       res = vhd_atomic_pio(pf, fd, _s, n, off);
+
+       /* the positional calls do not move the file position themselves */
+       if (res > 0)
+               if (lseek64(fd, off + res, SEEK_SET) == (off64_t) - 1)
+                       return -1;
+
+       return res;
+}
+
 int
-vhd_seek(vhd_context_t *ctx, off_t offset, int whence)
+vhd_seek(vhd_context_t *ctx, off64_t offset, int whence)
 {
-       off_t off;
-
-       off = lseek(ctx->fd, offset, whence);
-       if (off == (off_t)-1) {
+       off64_t off;
+
+       off = lseek64(ctx->fd, offset, whence);
+       if (off == (off64_t)-1) {
                VHDLOG("%s: seek(0x%08"PRIx64", %d) failed: %d\n",
                       ctx->file, offset, whence, -errno);
                return -errno;
@@ -2245,10 +2363,10 @@ vhd_seek(vhd_context_t *ctx, off_t offse
        return 0;
 }
 
-off_t
+off64_t
 vhd_position(vhd_context_t *ctx)
 {
-       return lseek(ctx->fd, 0, SEEK_CUR);
+       return lseek64(ctx->fd, 0, SEEK_CUR);
 }
 
 int
@@ -2258,7 +2376,7 @@ vhd_read(vhd_context_t *ctx, void *buf, 
 
        errno = 0;
 
-       ret = read(ctx->fd, buf, size);
+       ret = vhd_atomic_io(read, ctx->fd, buf, size);
        if (ret == size)
                return 0;
 
@@ -2275,7 +2393,7 @@ vhd_write(vhd_context_t *ctx, void *buf,
 
        errno = 0;
 
-       ret = write(ctx->fd, buf, size);
+       ret = vhd_atomic_io(vwrite, ctx->fd, buf, size);
        if (ret == size)
                return 0;
 
@@ -2285,6 +2403,40 @@ vhd_write(vhd_context_t *ctx, void *buf,
        return (errno ? -errno : -EIO);
 }
 
+/*
+ * Read exactly @size bytes from @ctx at byte @offset into @buf.
+ *
+ * Returns 0 when the full amount was transferred; otherwise logs the
+ * failure and returns -errno, or -EIO when errno was left untouched.
+ */
+static int
+vhd_pread(vhd_context_t * ctx, void *buf, size_t size, off64_t offset)
+{
+       ssize_t nread;
+
+       /* cleared so that a short transfer without an error maps to -EIO */
+       errno = 0;
+       nread = vhd_atomic_pio(pread, ctx->fd, buf, size, offset);
+
+       if (nread != size) {
+               VHDLOG("%s: pread of %zu returned %zd, errno: %d\n",
+                          ctx->file, size, nread, -errno);
+               return (errno ? -errno : -EIO);
+       }
+
+       return 0;
+}
+
+/*
+ * Write exactly @size bytes from @buf to @ctx at byte @offset.
+ *
+ * Returns 0 when the full amount was transferred; otherwise logs the
+ * failure and returns -errno, or -EIO when errno was left untouched.
+ */
+static int
+vhd_pwrite(vhd_context_t * ctx, void *buf, size_t size, off64_t offset)
+{
+       ssize_t nwritten;
+
+       /* cleared so that a short transfer without an error maps to -EIO */
+       errno = 0;
+       nwritten = vhd_atomic_pio(vpwrite, ctx->fd, buf, size, offset);
+
+       if (nwritten != size) {
+               VHDLOG("%s: pwrite of %zu returned %zd, errno: %d\n",
+                          ctx->file, size, nwritten, -errno);
+               return (errno ? -errno : -EIO);
+       }
+
+       return 0;
+}
+
 int
 vhd_offset(vhd_context_t *ctx, uint32_t sector, uint32_t *offset)
 {
@@ -2312,11 +2464,11 @@ int
 vhd_open_fast(vhd_context_t *ctx)
 {
        int err;
-       char *buf;
+       void *buf;
        size_t size;
 
        size = sizeof(vhd_footer_t) + sizeof(vhd_header_t);
-       err  = posix_memalign((void **)&buf, VHD_SECTOR_SIZE, size);
+       err  = posix_memalign(&buf, VHD_SECTOR_SIZE, size);
        if (err) {
                VHDLOG("failed allocating %s: %d\n", ctx->file, -err);
                return -err;
@@ -2360,12 +2512,14 @@ out:
 int
 vhd_open(vhd_context_t *ctx, const char *file, int flags)
 {
-       int err, oflags;
+       int i, err, oflags;
 
        if (flags & VHD_OPEN_STRICT)
                vhd_flag_clear(flags, VHD_OPEN_FAST);
 
        memset(ctx, 0, sizeof(vhd_context_t));
+       vhd_cache_init(ctx);
+
        ctx->fd     = -1;
        ctx->oflags = flags;
 
@@ -2373,7 +2527,9 @@ vhd_open(vhd_context_t *ctx, const char 
        if (err)
                return err;
 
-       oflags = O_DIRECT | O_LARGEFILE;
+       oflags = O_LARGEFILE;
+       if (!(flags & VHD_OPEN_CACHED))
+               oflags |= O_DIRECT;
        if (flags & VHD_OPEN_RDONLY)
                oflags |= O_RDONLY;
        if (flags & VHD_OPEN_RDWR)
@@ -2408,7 +2564,13 @@ vhd_open(vhd_context_t *ctx, const char 
        }
 
        if (vhd_type_dynamic(ctx)) {
-               err = vhd_read_header(ctx, &ctx->header);
+               for (i = 0; i < VHD_HEADER_MAX_RETRIES; i++) {
+                       err = vhd_read_header(ctx, &ctx->header);
+                       if (!err)
+                               break;
+                       VHDLOG("Error reading header, retry %d\n", i);
+                       sleep(1);
+               }
                if (err)
                        goto fail;
 
@@ -2416,6 +2578,12 @@ vhd_open(vhd_context_t *ctx, const char 
                ctx->bm_secs = secs_round_up_no_zero(ctx->spb >> 3);
        }
 
+       err = vhd_cache_load(ctx);
+       if (err) {
+               VHDLOG("failed to load cache: %d\n", err);
+               goto fail;
+       }
+
        return 0;
 
 fail:
@@ -2429,8 +2597,14 @@ fail:
 void
 vhd_close(vhd_context_t *ctx)
 {
+       vhd_cache_unload(ctx);
+
        if (ctx->file)
+       {
+               fsync(ctx->fd);
                close(ctx->fd);
+       }
+
        free(ctx->file);
        free(ctx->bat.bat);
        free(ctx->batmap.map);
@@ -2452,19 +2626,18 @@ vhd_initialize_footer(vhd_context_t *ctx
        ctx->footer.geometry     = vhd_chs(size);
        ctx->footer.type         = type;
        ctx->footer.saved        = 0;
-       ctx->footer.data_offset  = 0xFFFFFFFFFFFFFFFF;
+       ctx->footer.data_offset  = 0xFFFFFFFFFFFFFFFFULL;
        strcpy(ctx->footer.crtr_app, "tap");
-       vhd_uuid_generate(&ctx->footer.uuid);
+       uuid_generate(ctx->footer.uuid);
 }
 
-static int
+int
 vhd_initialize_header_parent_name(vhd_context_t *ctx, const char *parent_path)
 {
        int err;
        iconv_t cd;
        size_t ibl, obl;
-       char *ppath, *dst;
-       const char *pname;
+       char *pname, *ppath, *dst;
 
        err   = 0;
        pname = NULL;
@@ -2498,11 +2671,7 @@ vhd_initialize_header_parent_name(vhd_co
 
        memset(dst, 0, obl);
 
-       if (iconv(cd,
-#ifdef __linux__
-               (char **)
-#endif
-               &pname, &ibl, &dst, &obl) == (size_t)-1 || ibl)
+       if (iconv(cd, &pname, &ibl, &dst, &obl) == (size_t) - 1 || ibl)
                err = (errno ? -errno : -EINVAL);
 
 out:
@@ -2511,25 +2680,25 @@ out:
        return err;
 }
 
-static off_t
+static off64_t
 get_file_size(const char *name)
 {
        int fd;
-       off_t end;
+       off64_t end;
 
        fd = open(name, O_LARGEFILE | O_RDONLY);
        if (fd == -1) {
                VHDLOG("unable to open '%s': %d\n", name, errno);
                return -errno;
        }
-       end = lseek(fd, 0, SEEK_END);
+       end = lseek64(fd, 0, SEEK_END);
        close(fd); 
        return end;
 }
 
 static int
 vhd_initialize_header(vhd_context_t *ctx, const char *parent_path, 
-               uint64_t size, int raw)
+               uint64_t size, int raw, uint64_t *psize)
 {
        int err;
        struct stat stats;
@@ -2560,20 +2729,26 @@ vhd_initialize_header(vhd_context_t *ctx
 
        if (raw) {
                ctx->header.prt_ts = vhd_time(stats.st_mtime);
+               *psize = get_file_size(parent_path);
                if (!size)
-                       size = get_file_size(parent_path);
-       }
-       else {
+                       size = *psize;
+       } else {
                err = vhd_open(&parent, parent_path, VHD_OPEN_RDONLY);
                if (err)
                        return err;
 
                ctx->header.prt_ts = vhd_time(stats.st_mtime);
-               vhd_uuid_copy(&ctx->header.prt_uuid, &parent.footer.uuid);
+               uuid_copy(ctx->header.prt_uuid, parent.footer.uuid);
+               *psize = parent.footer.curr_size;
                if (!size)
-                       size = parent.footer.curr_size;
+                       size = *psize;
                vhd_close(&parent);
        }
+       if (size < *psize) {
+               VHDLOG("snapshot size (%" PRIu64 ") < parent size (%" PRIu64 
")\n",
+                          size, *psize);
+               return -EINVAL;
+       }
        ctx->footer.orig_size    = size;
        ctx->footer.curr_size    = size;
        ctx->footer.geometry     = vhd_chs(size);
@@ -2583,11 +2758,11 @@ vhd_initialize_header(vhd_context_t *ctx
        return vhd_initialize_header_parent_name(ctx, parent_path);
 }
 
-static int
+int
 vhd_write_parent_locators(vhd_context_t *ctx, const char *parent)
 {
        int i, err;
-       off_t off;
+       off64_t off;
        uint32_t code;
 
        code = PLAT_CODE_NONE;
@@ -2631,8 +2806,9 @@ vhd_change_parent(vhd_context_t *child, 
        char *ppath;
        struct stat stats;
        vhd_context_t parent;
-
-       ppath = realpath(parent_path, NULL);
+       char __parent_path[PATH_MAX];
+
+       ppath = realpath(parent_path, __parent_path);
        if (!ppath) {
                VHDLOG("error resolving parent path %s for %s: %d\n",
                       parent_path, child->file, errno);
@@ -2651,7 +2827,7 @@ vhd_change_parent(vhd_context_t *child, 
        }
 
        if (raw) {
-               vhd_uuid_clear(&child->header.prt_uuid);
+               uuid_clear(child->header.prt_uuid);
        } else {
                err = vhd_open(&parent, ppath, VHD_OPEN_RDONLY);
                if (err) {
@@ -2659,7 +2835,7 @@ vhd_change_parent(vhd_context_t *child, 
                               ppath, child->file, err);
                        goto out;
                }
-               vhd_uuid_copy(&child->header.prt_uuid, &parent.footer.uuid);
+               uuid_copy(child->header.prt_uuid, parent.footer.uuid);
                vhd_close(&parent);
        }
 
@@ -2700,16 +2876,16 @@ vhd_change_parent(vhd_context_t *child, 
        err = 0;
 
 out:
-       free(ppath);
        return err;
 }
 
 static int
 vhd_create_batmap(vhd_context_t *ctx)
 {
-       off_t off;
+       off64_t off;
        int err, map_bytes;
        vhd_batmap_header_t *header;
+       void *map;
 
        if (!vhd_type_dynamic(ctx))
                return -EINVAL;
@@ -2731,14 +2907,13 @@ vhd_create_batmap(vhd_context_t *ctx)
 
        map_bytes = vhd_sectors_to_bytes(header->batmap_size);
 
-       err = posix_memalign((void **)&ctx->batmap.map,
+       err = posix_memalign(&map,
                             VHD_SECTOR_SIZE, map_bytes);
-       if (err) {
-               ctx->batmap.map = NULL;
+       if (err)
                return -err;
-       }
-
-       memset(ctx->batmap.map, 0, map_bytes);
+
+       memset(map, 0, map_bytes);
+       ctx->batmap.map = map;
 
        return vhd_write_batmap(ctx, &ctx->batmap);
 }
@@ -2748,16 +2923,17 @@ vhd_create_bat(vhd_context_t *ctx)
 {
        int i, err;
        size_t size;
+       void *bat;
 
        if (!vhd_type_dynamic(ctx))
                return -EINVAL;
 
        size = vhd_bytes_padded(ctx->header.max_bat_size * sizeof(uint32_t));
-       err  = posix_memalign((void **)&ctx->bat.bat, VHD_SECTOR_SIZE, size);
-       if (err) {
-               ctx->bat.bat = NULL;
+       err  = posix_memalign(&bat, VHD_SECTOR_SIZE, size);
+       if (err)
                return err;
-       }
+
+       ctx->bat.bat = bat;
 
        memset(ctx->bat.bat, 0, size);
        for (i = 0; i < ctx->header.max_bat_size; i++)
@@ -2787,7 +2963,7 @@ vhd_initialize_fixed_disk(vhd_context_t 
                return err;
 
        buf = mmap(0, VHD_BLOCK_SIZE, PROT_READ,
-                  MAP_SHARED | MAP_ANON, -1, 0);
+                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return -errno;
 
@@ -2805,7 +2981,7 @@ out:
 }
 
 int 
-vhd_get_phys_size(vhd_context_t *ctx, off_t *size)
+vhd_get_phys_size(vhd_context_t *ctx, off64_t *size)
 {
        int err;
 
@@ -2816,9 +2992,9 @@ vhd_get_phys_size(vhd_context_t *ctx, of
 }
 
 int 
-vhd_set_phys_size(vhd_context_t *ctx, off_t size)
+vhd_set_phys_size(vhd_context_t *ctx, off64_t size)
 {
-       off_t phys_size;
+       off64_t phys_size;
        int err;
 
        err = vhd_get_phys_size(ctx, &phys_size);
@@ -2835,15 +3011,40 @@ vhd_set_phys_size(vhd_context_t *ctx, of
 }
 
 static int
-__vhd_create(const char *name, const char *parent, uint64_t bytes, int type,
-               vhd_flag_creat_t flags)
+vhd_set_virt_size_no_write(vhd_context_t *ctx, uint64_t size)
+{
+       if ((size >> VHD_BLOCK_SHIFT) > ctx->header.max_bat_size) {
+               VHDLOG("not enough metadata space reserved for fast "
+                          "resize (BAT size %u, need %" PRIu64 ")\n",
+                          ctx->header.max_bat_size, size >> VHD_BLOCK_SHIFT);
+               return -EINVAL;
+       }
+
+       /* update footer */
+       ctx->footer.curr_size = size;
+       ctx->footer.geometry = vhd_chs(ctx->footer.curr_size);
+       ctx->footer.checksum = vhd_checksum_footer(&ctx->footer);
+       return 0;
+}
+
+int vhd_set_virt_size(vhd_context_t * ctx, uint64_t size)
 {
        int err;
-       off_t off;
+
+       err = vhd_set_virt_size_no_write(ctx, size);
+       if (err)
+               return err;
+       return vhd_write_footer(ctx, &ctx->footer);
+}
+
+static int
+__vhd_create(const char *name, const char *parent, uint64_t bytes, int type,
+               uint64_t mbytes, vhd_flag_creat_t flags)
+{
+       int err;
+       off64_t off;
        vhd_context_t ctx;
-       vhd_footer_t *footer;
-       vhd_header_t *header;
-       uint64_t size, blks;
+       uint64_t size, psize, blks;
 
        switch (type) {
        case HD_TYPE_DIFF:
@@ -2859,10 +3060,19 @@ static int
        if (strnlen(name, VHD_MAX_NAME_LEN - 1) == VHD_MAX_NAME_LEN - 1)
                return -ENAMETOOLONG;
 
+       if (bytes && mbytes && mbytes < bytes)
+               return -EINVAL;
+
        memset(&ctx, 0, sizeof(vhd_context_t));
-       footer = &ctx.footer;
-       header = &ctx.header;
+       psize = 0;
        blks   = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+       /* If mbytes is provided (virtual-size-for-metadata-preallocation),
+        * create the VHD of size mbytes, which will create the BAT & the 
+        * batmap of the appropriate size. Once the BAT & batmap are 
+        * initialized, reset the virtual size to the requested one.
+        */
+       if (mbytes)
+               blks = (mbytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
        size   = blks << VHD_BLOCK_SHIFT;
 
        ctx.fd = open(name, O_WRONLY | O_CREAT |
@@ -2888,15 +3098,7 @@ static int
                        goto out;
        } else {
                int raw = vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW);
-               err = vhd_initialize_header(&ctx, parent, size, raw);
-               if (err)
-                       goto out;
-
-               err = vhd_write_footer_at(&ctx, &ctx.footer, 0);
-               if (err)
-                       goto out;
-
-               err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
+               err = vhd_initialize_header(&ctx, parent, size, raw, &psize);
                if (err)
                        goto out;
 
@@ -2913,8 +3115,28 @@ static int
                        if (err)
                                goto out;
                }
-
-               /* write header again since it may have changed */
+       }
+
+       if (mbytes) {
+               /* set the virtual size to the requested size */
+               if (bytes) {
+                       blks = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT;
+                       size = blks << VHD_BLOCK_SHIFT;
+
+               } else {
+                       size = psize;
+               }
+               ctx.footer.orig_size = size;
+               err = vhd_set_virt_size_no_write(&ctx, size);
+               if (err)
+                       goto out;
+       }
+
+       if (type != HD_TYPE_FIXED) {
+               err = vhd_write_footer_at(&ctx, &ctx.footer, 0);
+               if (err)
+                       goto out;
+
                err = vhd_write_header_at(&ctx, &ctx.header, VHD_SECTOR_SIZE);
                if (err)
                        goto out;
@@ -2925,7 +3147,7 @@ static int
                goto out;
 
        off = vhd_position(&ctx);
-       if (off == (off_t)-1) {
+       if (off == (off64_t)-1) {
                err = -errno;
                goto out;
        }
@@ -2947,16 +3169,17 @@ out:
 }
 
 int
-vhd_create(const char *name, uint64_t bytes, int type, vhd_flag_creat_t flags)
+vhd_create(const char *name, uint64_t bytes, int type, uint64_t mbytes,
+                  vhd_flag_creat_t flags)
 {
-       return __vhd_create(name, NULL, bytes, type, flags);
+       return __vhd_create(name, NULL, bytes, type, mbytes, flags);
 }
 
 int
 vhd_snapshot(const char *name, uint64_t bytes, const char *parent,
-               vhd_flag_creat_t flags)
+                        uint64_t mbytes, vhd_flag_creat_t flags)
 {
-       return __vhd_create(name, parent, bytes, HD_TYPE_DIFF, flags);
+       return __vhd_create(name, parent, bytes, HD_TYPE_DIFF, mbytes, flags);
 }
 
 static int
@@ -3000,7 +3223,7 @@ static int
 __vhd_io_dynamic_read_link(vhd_context_t *ctx, char *map,
                           char *buf, uint64_t sector, uint32_t secs)
 {
-       off_t off;
+       off64_t off;
        uint32_t blk, sec;
        int err, cnt, map_off;
        char *bitmap, *data, *src;
@@ -3056,9 +3279,9 @@ static int
                char *map, char *buf, uint64_t sec, uint32_t secs)
 {
        int fd, err;
-       off_t off;
+       off64_t off;
        uint64_t size;
-       char *data;
+       void *data;
 
        err = 0;
        errno = 0;
@@ -3068,8 +3291,8 @@ static int
                return -errno;
        }
 
-       off = lseek(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
-       if (off == (off_t)-1) {
+       off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET);
+       if (off == (off64_t)-1) {
                VHDLOG("%s: seek(0x%08"PRIx64") failed: %d\n",
                       filename, vhd_sectors_to_bytes(sec), -errno);
                err = -errno;
@@ -3077,7 +3300,7 @@ static int
        }
 
        size = vhd_sectors_to_bytes(secs);
-       err = posix_memalign((void **)&data, VHD_SECTOR_SIZE, size);
+       err = posix_memalign(&data, VHD_SECTOR_SIZE, size);
        if (err)
                goto close;
 
@@ -3134,12 +3357,21 @@ static int
                }
 
                if (vhd->footer.type == HD_TYPE_DIFF) {
+                       vhd_context_t *p;
+                       p = vhd_cache_get_parent(vhd);
+                       if (p) {
+                               vhd = p;
+                               err = vhd_get_bat(vhd);
+                               if (err)
+                                       goto out;
+                               continue;
+                       }
+
                        err = vhd_parent_locator_get(vhd, &next);
                        if (err)
                                goto close;
                        if (vhd_parent_raw(vhd)) {
-                               err = __raw_read_link(next, map, buf, sec,
-                                               secs);
+                               err = __raw_read_link(next, map, buf, sec, 
secs);
                                goto close;
                        }
                } else {
@@ -3164,7 +3396,7 @@ static int
        }
 
 close:
-       if (vhd != ctx)
+       if (vhd != ctx && !vhd_flag_test(vhd->oflags, VHD_OPEN_CACHED))
                vhd_close(vhd);
 out:
        free(map);
@@ -3202,8 +3434,8 @@ static int
 {
        char *buf;
        size_t size;
-       off_t off, max;
-       int i, err, gap, spp;
+       off64_t off, max;
+       int err, gap, spp, secs;
 
        spp = getpagesize() >> VHD_SECTOR_SHIFT;
 
@@ -3225,8 +3457,12 @@ static int
        if (err)
                return err;
 
-       size = vhd_sectors_to_bytes(ctx->spb + ctx->bm_secs + gap);
-       buf  = mmap(0, size, PROT_READ, MAP_SHARED | MAP_ANON, -1, 0);
+       secs = ctx->bm_secs + gap;
+       if (!vhd_flag_test(ctx->oflags, VHD_OPEN_IO_WRITE_SPARSE))
+               secs += ctx->spb;
+
+       size = vhd_sectors_to_bytes(secs);
+       buf  = mmap(0, size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return -errno;
 
@@ -3251,7 +3487,7 @@ static int
                       char *buf, uint64_t sector, uint32_t secs)
 {
        char *map;
-       off_t off;
+       off64_t off;
        uint32_t blk, sec;
        int i, err, cnt, ret;
 
@@ -3350,3 +3586,765 @@ vhd_io_write(vhd_context_t *ctx, char *b
 
        return __vhd_io_dynamic_write(ctx, buf, sec, secs);
 }
+
+/* Parent-cache setup hook for a context; there is currently nothing to do. */
+static void
+vhd_cache_init(vhd_context_t * ctx __attribute__((unused)))
+{
+}
+
+/* Non-zero when VHD_OPEN_CACHED is set in the open flags of @ctx. */
+static int
+vhd_cache_enabled(vhd_context_t * ctx)
+{
+       int cached = vhd_flag_test(ctx->oflags, VHD_OPEN_CACHED);
+
+       return cached;
+}
+
+/**
+ * vhd_cache_load - open the parent chain of @ctx and keep it attached
+ * @ctx: image that has just been opened
+ *
+ * Does nothing unless @ctx was opened with VHD_OPEN_CACHED.  Otherwise the
+ * chain of differencing images is walked: each parent is opened read-only,
+ * linked through ->parent and marked VHD_OPEN_CACHED.  The walk stops at
+ * the first image that is not HD_TYPE_DIFF or whose parent is raw.
+ *
+ * Returns 0 on success, -ENOMEM if a parent context cannot be allocated,
+ * or the error reported by vhd_parent_locator_get() / vhd_open().
+ */
+static int vhd_cache_load(vhd_context_t * ctx)
+{
+       char *next;
+       int err, pflags;
+       vhd_context_t *vhd;
+
+       err = 1;
+       pflags = ctx->oflags;
+       vhd = ctx;
+       next = NULL;
+
+       /*
+        * Parents are opened without VHD_OPEN_CACHED so that the nested
+        * vhd_open() does not try to load the chain a second time; the
+        * flag is set on each parent once it has been linked below.
+        */
+       vhd_flag_set(pflags, VHD_OPEN_RDONLY);
+       vhd_flag_clear(pflags, VHD_OPEN_CACHED);
+
+       if (!vhd_cache_enabled(vhd))
+               goto done;
+
+       while (vhd->footer.type == HD_TYPE_DIFF) {
+               vhd_context_t *parent;
+
+               parent = NULL;
+
+               if (vhd_parent_raw(vhd))
+                       goto done;
+
+               err = vhd_parent_locator_get(vhd, &next);
+               if (err)
+                       goto out;
+
+               parent = calloc(1, sizeof(*parent));
+               if (!parent) {
+                       /* err is still 0 here: report the failure */
+                       err = -ENOMEM;
+                       goto out;
+               }
+
+               err = vhd_open(parent, next, pflags);
+               if (err) {
+                       free(parent);
+                       parent = NULL;
+                       goto out;
+               }
+
+               /*
+                * vhd_open() adds O_DIRECT whenever VHD_OPEN_CACHED is
+                * absent; drop it again for the cached parent.
+                */
+               fcntl(parent->fd, F_SETFL,
+                         fcntl(parent->fd, F_GETFL) & ~O_DIRECT);
+               vhd_flag_set(parent->oflags, VHD_OPEN_CACHED);
+               vhd->parent = parent;
+
+               free(next);
+               next = NULL;
+               vhd = parent;
+       }
+
+  done:
+       err = 0;
+  out:
+       free(next);
+       /*
+        * NOTE(review): vhd is the last image linked so far and its
+        * ->parent is still NULL, so this call releases nothing.  Parents
+        * already linked stay attached to ctx -- presumably the caller's
+        * failure path closes ctx and releases them; confirm.
+        */
+       if (err)
+               vhd_cache_unload(vhd);
+
+       return err;
+}
+
+/**
+ * vhd_cache_unload - close and free the cached parents of @ctx
+ * @ctx: image whose ->parent chain should be released
+ *
+ * Does nothing unless @ctx has VHD_OPEN_CACHED set.  @ctx itself is neither
+ * closed nor freed; its ->parent is reset to NULL.  Always returns 0.
+ */
+static int vhd_cache_unload(vhd_context_t * ctx)
+{
+       vhd_context_t *vhd, *next;
+
+       if (!vhd_cache_enabled(ctx))
+               goto out;
+
+       for (vhd = ctx->parent; vhd; vhd = next) {
+               vhd_close(vhd);
+               /*
+                * Fetch the link only after vhd_close(): closing a cached
+                * image unloads its own ancestors, so a pointer saved
+                * earlier could already have been freed.  It must also be
+                * read before free(vhd), never after.
+                */
+               next = vhd->parent;
+               free(vhd);
+       }
+       ctx->parent = NULL;
+
+  out:
+       return 0;
+}
+
+/*
+ * Return the cached parent of @ctx, or NULL when @ctx was not opened with
+ * VHD_OPEN_CACHED.
+ */
+static inline vhd_context_t *
+vhd_cache_get_parent(vhd_context_t * ctx)
+{
+       return vhd_cache_enabled(ctx) ? ctx->parent : NULL;
+}
+
+typedef struct vhd_block_vector vhd_block_vector_t;
+typedef struct vhd_block_vector_entry vhd_block_vector_entry_t;
+
+/* One contiguous byte span inside a block and the buffer it is read into. */
+struct vhd_block_vector_entry {
+       uint64_t off;                           /* byte offset of the span from the start of the block's data */
+       uint32_t bytes;                         /* length of the span in bytes */
+       char *buf;                                      /* destination buffer for the span */
+};
+
+/* The list of spans to read from a single block. */
+struct vhd_block_vector {
+       uint32_t block;                         /* logical block in vhd (index into the BAT) */
+       int entries;                            /* number of entries in use in @array */
+       vhd_block_vector_entry_t *array;        /* span list, allocated by vhd_block_vector_init() */
+};
+
+/**
+ * vhd_block_vector_read - read every span listed in @vec
+ * @ctx: image to read from
+ * @vec: block vector describing the read
+ *
+ * @vec lists byte spans within one block together with the buffer each
+ * span is read into.
+ *
+ * Returns 0 on success, -ERANGE when the block lies outside the BAT,
+ * -EINVAL when the block is unallocated, or the first error returned by
+ * vhd_get_bat() / vhd_pread().
+ */
+static int
+vhd_block_vector_read(vhd_context_t * ctx, vhd_block_vector_t * vec)
+{
+       int rc, idx;
+       off64_t base;
+       uint32_t bat_entry;
+
+       rc = vhd_get_bat(ctx);
+       if (rc)
+               return rc;
+
+       if (vec->block >= ctx->bat.entries)
+               return -ERANGE;
+
+       bat_entry = ctx->bat.bat[vec->block];
+       if (bat_entry == DD_BLK_UNUSED)
+               return -EINVAL;
+
+       /* span offsets are counted from bm_secs sectors into the block */
+       base = vhd_sectors_to_bytes(bat_entry + ctx->bm_secs);
+
+       for (idx = 0; idx < vec->entries; idx++) {
+               vhd_block_vector_entry_t *span = &vec->array[idx];
+
+               rc = vhd_pread(ctx, span->buf, span->bytes, base + span->off);
+               if (rc)
+                       return rc;
+       }
+
+       return rc;
+}
+
+/**
+ * @ctx: image the block belongs to
+ * @vec: block vector to initialize
+ * @block: vhd block number
+ * @map: optional bitmap of sectors to map (relative to beginning of block)
+ * @buf: destination buffer
+ * @blk_start: byte offset relative to beginning of block
+ * @blk_end: byte offset relative to beginning of block
+ *
+ * initializes @vec to describe a read into a contiguous buffer
+ * of potentially non-contiguous byte ranges in a given vhd block.
+ * only sectors with corresponding bits set in @map (if it is not NULL)
+ * will be mapped; bits corresponding to unmapped sectors will be cleared.
+ * first and last sector maps may be smaller than vhd sector size.
+ *
+ * returns 0 on success, -ENOMEM if the entry array cannot be allocated,
+ * or the error from vhd_read_bitmap().  @vec->array is allocated here
+ * and is not freed by this function.
+ */
+static int
+vhd_block_vector_init(vhd_context_t * ctx,
+                                         vhd_block_vector_t * vec, uint32_t 
block, char *map,
+                                         char *buf, uint64_t blk_start, 
uint64_t blk_end)
+{
+       int err, sec;
+       char *bitmap;
+       uint32_t first_sec, last_sec;
+
+       bitmap = NULL;
+       memset(vec, 0, sizeof(*vec));
+
+       first_sec = blk_start >> VHD_SECTOR_SHIFT;
+       last_sec = secs_round_up_no_zero(blk_end);
+
+       err = vhd_read_bitmap(ctx, block, &bitmap);
+       if (err)
+               goto out;
+
+       /* one entry per sector of the block is the most that can be needed */
+       vec->array = calloc(ctx->spb, sizeof(vhd_block_vector_entry_t));
+       if (!vec->array) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (sec = first_sec; sec < last_sec; sec++) {
+               uint32_t cnt;
+               vhd_block_vector_entry_t *v;
+
+               /*
+                * bytes from blk_start up to the end of its sector, capped
+                * at the number of bytes still left in the request
+                */
+               cnt = VHD_SECTOR_SIZE - (blk_start & (VHD_SECTOR_SIZE - 1));
+               if (cnt > blk_end - blk_start)
+                       cnt = blk_end - blk_start;
+
+               /* caller did not ask for this sector: just step over it */
+               if (map && !test_bit(map, sec))
+                       goto next;
+
+               if (vhd_bitmap_test(ctx, bitmap, sec)) {
+                       /* grow the previous entry when this span follows it */
+                       if (vec->entries > 0) {
+                               v = vec->array + vec->entries - 1;
+                               if (v->off + v->bytes == blk_start) {
+                                       v->bytes += cnt;
+                                       goto next;
+                               }
+                       }
+
+                       v = vec->array + vec->entries;
+                       v->off = blk_start;
+                       v->bytes = cnt;
+                       v->buf = buf;
+
+                       vec->entries++;
+
+               } else if (map) {
+                       /* bit not set in the block bitmap: unmark the sector */
+                       clear_bit(map, sec);
+               }
+
+         next:
+               /* block offset and buffer cursor advance in step */
+               blk_start += cnt;
+               buf += cnt;
+       }
+
+       vec->block = block;
+
+  out:
+       free(bitmap);
+       return err;
+}
+
+#if 0
+/**
+ * NOTE: this helper is compiled out by the surrounding #if 0.
+ *
+ * @ctx: image to read from
+ * @block: vhd block number
+ * @buf: buffer to place data in
+ * @start: byte offset into block from which to start reading
+ * @end: byte offset in block at which to stop reading
+ *
+ * reads data (if it exists) into @buf.  partial reads may occur
+ * for the first and last sectors if @start and @end are not multiples
+ * of vhd sector size.
+ *
+ * returns 0 on success, or the error from vhd_block_vector_init() /
+ * vhd_block_vector_read().
+ */
+static int
+vhd_block_vector_read_allocated(vhd_context_t * ctx, uint32_t block,
+                                                               char *buf, 
uint64_t start, uint64_t end)
+{
+       int err;
+       vhd_block_vector_t vec;
+
+       vec.array = NULL;
+
+       /* NULL map: every sector in [start, end) is a candidate */
+       err = vhd_block_vector_init(ctx, &vec, block, NULL, buf, start, end);
+       if (err)
+               goto out;
+
+       err = vhd_block_vector_read(ctx, &vec);
+
+  out:
+       free(vec.array);
+       return err;
+}
+#endif
+
+/**
+ * @ctx: image to read from
+ * @block: vhd block number
+ * @map: bitmap of the sectors in @block that should be read
+ * @buf: buffer to place data in
+ * @start: byte offset into block from which to start reading
+ * @end: byte offset in block at which to stop reading
+ *
+ * Reads the data of every sector whose bit is set in @map into @buf, if
+ * that data exists; bits of sectors without data are cleared in @map.
+ * The first and last sectors may be read only partially when @start and
+ * @end are not multiples of the vhd sector size.
+ *
+ * Returns 0 on success, or the error from vhd_block_vector_init() /
+ * vhd_block_vector_read().
+ */
+static int
+vhd_block_vector_read_allocated_selective(vhd_context_t * ctx,
+               uint32_t block, char *map, char *buf,
+               uint64_t start, uint64_t end)
+{
+       int rc;
+       vhd_block_vector_t vec;
+
+       vec.array = NULL;
+
+       rc = vhd_block_vector_init(ctx, &vec, block, map, buf, start, end);
+       if (!rc)
+               rc = vhd_block_vector_read(ctx, &vec);
+
+       /* the entry array is released on success and on failure alike */
+       free(vec.array);
+       return rc;
+}
+
+/**
+ * @ctx: image to read from; its BAT is indexed directly here
+ * @map: bitmap of sectors which have already been read
+ * @buf: destination buffer
+ * @size: size in bytes to read
+ * @off: byte offset in virtual disk to read
+ *
+ * reads @size bytes into @buf, starting at @off, skipping sectors
+ * which have corresponding bits set in @map.  bits are set in @map for
+ * every sector this image supplied.  blocks that are unallocated, or
+ * that lie beyond the end of this image's BAT, supply nothing.
+ *
+ * returns 0 on success, -ENOMEM, or the error from
+ * vhd_block_vector_read_allocated_selective().
+ */
+static int
+__vhd_io_dynamic_read_link_bytes(vhd_context_t * ctx, char *map,
+               char *buf, size_t size, uint64_t off)
+{
+       char *blkmap;
+       int i, err, map_off;
+       off64_t blk_off, blk_size;
+       uint32_t blk, bytes, first_sec, last_sec;
+
+       /* scratch bitmap with one bit per sector of a block */
+       blkmap = malloc((ctx->spb + 7) >> 3);
+       if (!blkmap) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       map_off = 0;
+       blk_size = vhd_sectors_to_bytes(ctx->spb);
+
+       do {
+               /* portion of the request that falls into block blk */
+               blk = off / blk_size;
+               blk_off = off % blk_size;
+               bytes = MIN(blk_size - blk_off, size);
+
+               first_sec = blk_off >> VHD_SECTOR_SHIFT;
+               last_sec = secs_round_up_no_zero(blk_off + bytes);
+
+               /*
+                * An image may be smaller than the one being read through
+                * it (a snapshot may be larger than its parent), so the
+                * block can lie past the end of this BAT.  Such a block
+                * holds no data here: treat it like an unallocated one
+                * instead of indexing outside the table.
+                */
+               if (blk >= ctx->bat.entries ||
+                       ctx->bat.bat[blk] == DD_BLK_UNUSED)
+                       goto next;
+
+               memset(blkmap, 0, (ctx->spb + 7) >> 3);
+
+               /* request only the sectors nobody has supplied yet */
+               for (i = 0; i < (last_sec - first_sec); i++)
+                       if (!test_bit(map, map_off + i))
+                               set_bit(blkmap, first_sec + i);
+
+               err = vhd_block_vector_read_allocated_selective(ctx, blk,
+                               blkmap, buf, blk_off, blk_off + bytes);
+               if (err)
+                       goto out;
+
+               /* bits still set in blkmap are sectors that were read */
+               for (i = 0; i < (last_sec - first_sec); i++)
+                       if (test_bit(blkmap, first_sec + i))
+                               set_bit(map, map_off + i);
+
+         next:
+               size -= bytes;
+               off += bytes;
+               map_off += (last_sec - first_sec);
+               buf += bytes;
+
+       } while (size);
+
+       err = 0;
+  out:
+       free(blkmap);
+       return err;
+}
+
+/**
+ * @filename: path of the raw parent image
+ * @map: bitmap with one bit per sector touched by the request (bit 0 is
+ *       the sector containing @off); sectors whose bit is set are skipped
+ * @buf: destination buffer; byte 0 corresponds to @off
+ * @size: size in bytes to read
+ * @off: byte offset in the raw image to read
+ *
+ * fills every part of [@off, @off + @size) whose sector bit is clear in
+ * @map with data from @filename.  @map itself is not modified.
+ *
+ * returns 0 on success, -errno if the file cannot be opened or read, or
+ * -EIO on a short read that reported no error.
+ */
+static int
+__raw_read_link_bytes(const char *filename,
+               char *map, char *buf, size_t size, uint64_t off)
+{
+       int fd, err;
+       uint32_t i, first_sec, last_sec;
+
+       fd = open(filename, O_RDONLY | O_LARGEFILE);
+       if (fd == -1) {
+               /* capture errno before logging has a chance to change it */
+               err = -errno;
+               VHDLOG("%s: failed to open: %d\n", filename, err);
+               return err;
+       }
+
+       first_sec = off >> VHD_SECTOR_SHIFT;
+       last_sec = secs_round_up_no_zero(off + size);
+
+       for (i = first_sec; i < last_sec; i++) {
+               if (!test_bit(map, i - first_sec)) {
+                       ssize_t ret;
+                       uint32_t secs = 0;
+                       uint64_t coff, cend, csize;
+
+                       /* collect the run of consecutive missing sectors */
+                       while (i + secs < last_sec &&
+                                  !test_bit(map, i + secs - first_sec))
+                               secs++;
+
+                       /*
+                        * Byte range of the run, clipped to the request:
+                        * the first sector may begin before @off and the
+                        * last may extend past @off + @size.  Reading the
+                        * unclipped range would overwrite neighbouring
+                        * data in @buf or run past its end.
+                        */
+                       coff = vhd_sectors_to_bytes(i);
+                       cend = vhd_sectors_to_bytes(i + secs);
+                       if (coff < off)
+                               coff = off;
+                       if (cend > off + size)
+                               cend = off + size;
+                       csize = cend - coff;
+
+                       /* so a short read without an error maps to -EIO */
+                       errno = 0;
+                       ret = pread(fd, buf + coff - off, csize, coff);
+                       if (ret < 0 || (uint64_t)ret != csize) {
+                               err = (errno ? -errno : -EIO);
+                               goto close;
+                       }
+
+                       /* the loop increment steps past the run's last sector */
+                       i += secs - 1;
+               }
+       }
+
+       err = 0;
+
+  close:
+       close(fd);
+       return err;
+}
+
+/*
+ * Read @size bytes at virtual-disk byte offset @off into @buf, starting
+ * with @ctx and walking up its parent chain.
+ *
+ * A bitmap with one bit per sector touched by the request is handed to
+ * every link reader.  The walk ends when all bits are set, when a
+ * non-differencing image has been consulted, or after a raw parent has
+ * been read.  On success, every sector whose bit is still clear is
+ * zero-filled in @buf.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+__vhd_io_dynamic_read_bytes(vhd_context_t * ctx,
+                            char *buf, size_t size, uint64_t off)
+{
+       int err;
+       char *next, *map;
+       vhd_context_t parent, *vhd;
+       uint32_t i, done, first_sec, last_sec;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       /* [first_sec, last_sec) is the sector range covering the request */
+       first_sec = off >> VHD_SECTOR_SHIFT;
+       last_sec = secs_round_up_no_zero(off + size);
+
+       vhd = ctx;
+       next = NULL;
+       map = calloc(1, ((last_sec - first_sec) + 7) >> 3);
+       if (!map) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       for (;;) {
+               err = __vhd_io_dynamic_read_link_bytes(vhd, map, buf, size, off);
+               if (err)
+                       goto close;
+
+               for (done = 0, i = 0; i < (last_sec - first_sec); i++)
+                       if (test_bit(map, i))
+                               done++;
+
+               if (done == last_sec - first_sec) {
+                       err = 0;
+                       goto close;
+               }
+
+               if (vhd->footer.type == HD_TYPE_DIFF) {
+                       vhd_context_t *p;
+                       p = vhd_cache_get_parent(vhd);
+                       if (p) {
+                               vhd = p;
+                               err = vhd_get_bat(vhd);
+                               if (err)
+                                       goto out;
+                               continue;
+                       }
+
+                       err = vhd_parent_locator_get(vhd, &next);
+                       if (err)
+                               goto close;
+
+                       if (vhd_parent_raw(vhd)) {
+                               err = __raw_read_link_bytes(next, map, buf, size, off);
+                               goto close;
+                       }
+               } else {
+                       err = 0;
+                       goto close;
+               }
+
+               /*
+                * Only close contexts opened by this function; apply the
+                * same VHD_OPEN_CACHED test as the final cleanup below.
+                */
+               if (vhd != ctx && !vhd_flag_test(vhd->oflags, VHD_OPEN_CACHED))
+                       vhd_close(vhd);
+               vhd = &parent;
+
+               err = vhd_open(vhd, next, VHD_OPEN_RDONLY);
+               if (err)
+                       goto out;
+
+               err = vhd_get_bat(vhd);
+               if (err)
+                       goto close;
+
+               free(next);
+               next = NULL;
+       }
+
+  close:
+       if (!err) {
+               /*
+                * clear any regions not present on disk
+                */
+               for (i = first_sec; i < last_sec; i++) {
+                       uint64_t start, end;
+
+                       if (test_bit(map, i - first_sec))
+                               continue;
+
+                       /*
+                        * Clamp the sector to [off, off + size): with an
+                        * unaligned @off the first sector is shorter than
+                        * VHD_SECTOR_SIZE, and zeroing a full sector would
+                        * clobber the following sector's data or overrun
+                        * @buf.
+                        */
+                       start = vhd_sectors_to_bytes(i);
+                       end = vhd_sectors_to_bytes((uint64_t)i + 1);
+                       if (start < off)
+                               start = off;
+                       if (end > off + size)
+                               end = off + size;
+
+                       if (end > start)
+                               memset(buf + start - off, 0, end - start);
+               }
+       }
+
+       if (vhd != ctx && !vhd_flag_test(vhd->oflags, VHD_OPEN_CACHED))
+               vhd_close(vhd);
+  out:
+       free(map);
+       free(next);
+       return err;
+}
+
+/*
+ * Read @size bytes at virtual-disk byte offset @off into @buf.
+ *
+ * Returns -ERANGE when the request extends past footer.curr_size.
+ * Images that are not dynamic are read directly with vhd_pread();
+ * dynamic ones go through __vhd_io_dynamic_read_bytes().
+ */
+int
+vhd_io_read_bytes(vhd_context_t * ctx, void *buf, size_t size,
+                  uint64_t off)
+{
+       uint64_t limit = ctx->footer.curr_size;
+
+       /* phrased so that a huge off/size pair cannot wrap past the check */
+       if (off > limit || size > limit - off)
+               return -ERANGE;
+
+       if (!vhd_type_dynamic(ctx))
+               return vhd_pread(ctx, buf, size, off);
+
+       return __vhd_io_dynamic_read_bytes(ctx, buf, size, off);
+}
+
+/*
+ * Record sectors [first_sec, last_sec) of block @blk as written: read the
+ * block's sector bitmap, set the bits, and write it back.  When the image
+ * has a batmap and every sector of the block is now set, the block is
+ * also flagged in the batmap, which is then written out.
+ *
+ * Returns 0 on success or the first error reported by a helper.
+ */
+static int
+__vhd_io_dynamic_mark_written(vhd_context_t * ctx, uint32_t blk,
+                              uint32_t first_sec, uint32_t last_sec)
+{
+       char *bitmap;
+       int sec, rc, full;
+
+       rc = vhd_read_bitmap(ctx, blk, &bitmap);
+       if (rc)
+               return rc;
+
+       for (sec = first_sec; sec < last_sec; sec++)
+               vhd_bitmap_set(ctx, bitmap, sec);
+
+       rc = vhd_write_bitmap(ctx, blk, bitmap);
+
+       if (!rc && vhd_has_batmap(ctx)) {
+               full = 1;
+               for (sec = 0; sec < ctx->spb; sec++) {
+                       if (!vhd_bitmap_test(ctx, bitmap, sec)) {
+                               full = 0;
+                               break;
+                       }
+               }
+
+               if (full) {
+                       vhd_batmap_set(ctx, &ctx->batmap, blk);
+                       rc = vhd_write_batmap(ctx, &ctx->batmap);
+               }
+       }
+
+       free(bitmap);
+       return rc;
+}
+
+/*
+ * Write @size bytes from @buf at byte offset @off of a dynamic image.
+ * Both @off and @size must be multiples of VHD_SECTOR_SIZE, otherwise
+ * -EINVAL is returned.
+ *
+ * The request is processed one block at a time.  Blocks whose BAT entry
+ * is DD_BLK_UNUSED are allocated first.  After the data is written, the
+ * sector bitmap is updated unless the batmap already flags the block.
+ *
+ * Once the write loop has been entered the footer is rewritten before
+ * returning, whether or not the loop succeeded; a loop error takes
+ * precedence over the footer-write result.
+ */
+static int
+__vhd_io_dynamic_write_bytes_aligned(vhd_context_t * ctx,
+                                     char *buf, size_t size, uint64_t off)
+{
+       int err, ret;
+       uint64_t in_blk, blk_bytes, data_start;
+       uint32_t blk, chunk, first_sec, last_sec;
+
+       if (off & (VHD_SECTOR_SIZE - 1) || size & (VHD_SECTOR_SIZE - 1))
+               return -EINVAL;
+
+       err = vhd_get_bat(ctx);
+       if (err)
+               return err;
+
+       if (vhd_has_batmap(ctx)) {
+               err = vhd_get_batmap(ctx);
+               if (err)
+                       return err;
+       }
+
+       blk_bytes = vhd_sectors_to_bytes(ctx->spb);
+
+       do {
+               /* locate the part of the request that falls in this block */
+               blk = off / blk_bytes;
+               in_blk = off % blk_bytes;
+               chunk = MIN(blk_bytes - in_blk, size);
+
+               first_sec = in_blk >> VHD_SECTOR_SHIFT;
+               last_sec = secs_round_up_no_zero(in_blk + chunk);
+
+               data_start = ctx->bat.bat[blk];
+               if (data_start == DD_BLK_UNUSED) {
+                       err = __vhd_io_allocate_block(ctx, blk);
+                       if (err)
+                               break;
+
+                       data_start = ctx->bat.bat[blk];
+               }
+
+               /* the data sectors follow the block's bitmap sectors */
+               data_start = vhd_sectors_to_bytes(data_start + ctx->bm_secs);
+
+               err = vhd_pwrite(ctx, buf, chunk, data_start + in_blk);
+               if (err)
+                       break;
+
+               /* no bitmap update when the batmap already flags the block */
+               if (!vhd_has_batmap(ctx) ||
+                   !vhd_batmap_test(ctx, &ctx->batmap, blk)) {
+                       err = __vhd_io_dynamic_mark_written(ctx, blk,
+                                                           first_sec, last_sec);
+                       if (err)
+                               break;
+               }
+
+               size -= chunk;
+               off += chunk;
+               buf += chunk;
+
+       } while (size);
+
+       ret = vhd_write_footer(ctx, &ctx->footer);
+       return (err ? err : ret);
+}
+
+/*
+ * Read-modify-write of one sector: read sector @sec through
+ * vhd_io_read_bytes() into @sector, overlay @len bytes of @src at byte
+ * @at, and write the sector back with the aligned writer.
+ */
+static int
+__vhd_io_dynamic_patch_sector(vhd_context_t * ctx, char *sector,
+                              uint32_t sec, uint32_t at,
+                              const char *src, uint32_t len)
+{
+       int rc;
+
+       rc = vhd_io_read_bytes(ctx, sector, VHD_SECTOR_SIZE,
+                              vhd_sectors_to_bytes(sec));
+       if (rc)
+               return rc;
+
+       memcpy(sector + at, src, len);
+
+       return __vhd_io_dynamic_write_bytes_aligned(ctx, sector,
+                                                   VHD_SECTOR_SIZE,
+                                                   vhd_sectors_to_bytes(sec));
+}
+
+/*
+ * Write @size bytes from @buf at an arbitrary byte offset @off of a
+ * dynamic image.
+ *
+ * A partial leading sector and a partial trailing sector are each
+ * handled by read-modify-write through a one-sector bounce buffer.
+ * Whatever remains in between is sector-aligned and is passed to
+ * __vhd_io_dynamic_write_bytes_aligned() directly.
+ *
+ * Returns 0 on success or a negative errno value.
+ */
+static int
+__vhd_io_dynamic_write_bytes(vhd_context_t * ctx,
+                             char *buf, size_t size, uint64_t off)
+{
+       int err = 0;
+       char *bounce = NULL;
+       const uint32_t sec_lo = off >> VHD_SECTOR_SHIFT;
+       const uint32_t sec_hi = secs_round_up_no_zero(off + size);
+       const uint32_t head = off & (VHD_SECTOR_SIZE - 1);
+       const uint32_t tail = (off + size) & (VHD_SECTOR_SIZE - 1);
+
+       if (head || tail) {
+               bounce = malloc(VHD_SECTOR_SIZE);
+               if (!bounce) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+       }
+
+       if (head) {
+               /* bytes of the request that land in the first sector */
+               uint32_t len = VHD_SECTOR_SIZE - head;
+               if (len > size)
+                       len = size;
+
+               err = __vhd_io_dynamic_patch_sector(ctx, bounce, sec_lo,
+                                                   head, buf, len);
+               if (err)
+                       goto out;
+
+               buf += len;
+               off += len;
+               size -= len;
+       }
+
+       /*
+        * The trailing partial sector is skipped when it is the same
+        * sector that was just patched as the leading one.
+        */
+       if (tail && (sec_hi - sec_lo > 1 || !head)) {
+               err = __vhd_io_dynamic_patch_sector(ctx, bounce, sec_hi - 1,
+                                                   0, buf + size - tail,
+                                                   tail);
+               if (err)
+                       goto out;
+
+               size -= tail;
+       }
+
+       if (size)
+               err = __vhd_io_dynamic_write_bytes_aligned(ctx, buf, size, off);
+
+  out:
+       free(bounce);
+       return err;
+}
+
+/*
+ * Write @size bytes from @buf at virtual-disk byte offset @off.
+ *
+ * Returns -ERANGE when the request extends past footer.curr_size.
+ * Images that are not dynamic are written directly with vhd_pwrite();
+ * dynamic ones go through __vhd_io_dynamic_write_bytes().
+ */
+int
+vhd_io_write_bytes(vhd_context_t * ctx, void *buf, size_t size,
+                   uint64_t off)
+{
+       uint64_t limit = ctx->footer.curr_size;
+
+       /* phrased so that a huge off/size pair cannot wrap past the check */
+       if (off > limit || size > limit - off)
+               return -ERANGE;
+
+       if (!vhd_type_dynamic(ctx))
+               return vhd_pwrite(ctx, buf, size, off);
+
+       return __vhd_io_dynamic_write_bytes(ctx, buf, size, off);
+}
+
+/*
+ * Fetch the marker byte stored in the batmap header of @ctx.
+ *
+ * *@marker is zeroed up front and only overwritten on success.
+ * Returns -ENOSYS when the image has no batmap, the error from
+ * vhd_read_batmap_header() when the header cannot be read, else 0.
+ */
+int vhd_marker(vhd_context_t * ctx, char *marker)
+{
+       vhd_batmap_t hdr;
+       int rc;
+
+       *marker = 0;
+
+       if (!vhd_has_batmap(ctx))
+               return -ENOSYS;
+
+       rc = vhd_read_batmap_header(ctx, &hdr);
+       if (!rc)
+               *marker = hdr.header.marker;
+
+       return rc;
+}
+
+/*
+ * Store @marker in the batmap header of @ctx.
+ *
+ * The header is read, its marker field replaced, and the header written
+ * back.  Returns -ENOSYS when the image has no batmap, otherwise the
+ * result of the header read (on failure) or of the header write.
+ */
+int vhd_set_marker(vhd_context_t * ctx, char marker)
+{
+       vhd_batmap_t hdr;
+       int rc;
+
+       if (!vhd_has_batmap(ctx))
+               return -ENOSYS;
+
+       rc = vhd_read_batmap_header(ctx, &hdr);
+       if (!rc) {
+               hdr.header.marker = marker;
+               rc = vhd_write_batmap_header(ctx, &hdr);
+       }
+
+       return rc;
+}

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.