[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [PATCH 4/4] (Refactored) Add libvdisk, and vdisk_tool
[PATCH 4/4] (Refactored) Add libvdisk, and vdisk_tool

vdisk-support.patch provides libvdisk, and vdisk_tool, as described in [PATCH 0/4]

Signed-off-by: Boris Ostrovsky <bostrovsky@xxxxxxxxxxxxxxx>
Signed-off-by: Ben Guthro <bguthro@xxxxxxxxxxxxxxx>

diff -r 75c61490cc06 tools/Makefile
--- a/tools/Makefile Thu Jun 21 13:05:29 2007 -0400
+++ b/tools/Makefile Thu Jun 21 13:05:31 2007 -0400
@@ -17,6 +17,7 @@ SUBDIRS-$(VTPM_TOOLS) += vtpm
 SUBDIRS-$(VTPM_TOOLS) += vtpm
 SUBDIRS-y += xenstat
 SUBDIRS-y += libaio
+SUBDIRS-y += vdisk
 SUBDIRS-y += blktap
 SUBDIRS-y += libfsimage
 SUBDIRS-$(XENFB_TOOLS) += xenfb
diff -r 75c61490cc06 tools/vdisk/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/Makefile Thu Jun 21 13:05:45 2007 -0400
@@ -0,0 +1,65 @@
+#
+# Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+#
+# Portions have been modified by Virtual Iron Software, Inc.
+# (c) 2007. This file and the modifications can be redistributed and/or
+# modified under the terms and conditions of the GNU General Public
+# License, version 2.1 and not any later version of the GPL, as published
+# by the Free Software Foundation.
+#
+XEN_ROOT = ../..
+include $(XEN_ROOT)/tools/Rules.mk
+
+LIBVHD_SRC = vhd.c vhd_utils.c
+LIBVDISK_SRC = vdisk_utils.c vdisk_common.c
+TOOL_SRC = vdisk_tool.c
+
+LIBAIO_DIR = ../libaio/src
+BLKTAP_DIR = ../blktap/drivers
+
+CFLAGS = -O2 -fno-strict-aliasing -fPIC -Wall -Werror -rdynamic \
+	-D_FILE_OFFSET_BITS=64 \
+	-D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -I./ \
+	-I$(LIBAIO_DIR) \
+	-I$(BLKTAP_DIR)
+
+LIB_LDFLAGS = -dy -shared -L$(LIBAIO_DIR) -laio
+
+INSTALL = /usr/bin/install
+
+all: default
+default: vdisk_tool libvdisk_vhd.so libvdisk.so
+
+
+%.o: %.c
+	$(CC) $(CFLAGS) -rdynamic -c $< -o $@
+
+# The tool must be built with the same CFLAGS as the libraries: vdisk.h
+# embeds off_t in its structures, so -D_FILE_OFFSET_BITS=64 has to match.
+vdisk_tool: $(TOOL_SRC:%.c=%.o) libvdisk_vhd.so libvdisk.so
+	$(CC) $(CFLAGS) -o $@ -g $(TOOL_SRC) \
+	-L. -L$(LIBAIO_DIR) \
+	-lvdisk -ldl -laio
+
+libvdisk_vhd.so: $(LIBVHD_SRC:%.c=%.o) libvdisk.so
+	$(LD) $(LIB_LDFLAGS) -o $@ $^
+
+libvdisk.so: $(LIBVDISK_SRC:%.c=%.o)
+	$(LD) $(LIB_LDFLAGS) -o $@ $^
+
+install: all
+	$(INSTALL) -d $(DESTDIR)/usr/bin
+	$(INSTALL) -d $(DESTDIR)/usr/lib64
+	$(INSTALL) vdisk_tool $(DESTDIR)/usr/bin
+	$(INSTALL) libvdisk_vhd.so libvdisk.so $(DESTDIR)/usr/lib64
+	$(INSTALL) -d $(DESTDIR)/usr/include
+	for header in *.h; do $(INSTALL) $$header $(DESTDIR)/usr/include; done
+
+clean:
+	/bin/rm -f *.o libvdisk_vhd.so vdisk_tool libvdisk.so
+
+depend .depend dep:
+	$(CC) $(CFLAGS) -M $(LIBVDISK_SRC) $(LIBVHD_SRC) $(TOOL_SRC)> .depend
+
+ifeq (.depend,$(wildcard .depend))
+include .depend
+endif
diff -r 75c61490cc06 tools/vdisk/list.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/list.h Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,168 @@
+// Copy of /usr/include/linux/list.h that does not
+// depend on __KERNEL__ and _LVM_H_INCLUDE
+
+#ifndef _LIST_H
+#define _LIST_H
+
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+    struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+    struct list_head name = LIST_HEAD_INIT(name)
+
+#define INIT_LIST_HEAD(ptr) do { \
+    (ptr)->next = (ptr); (ptr)->prev = (ptr); \
+} while (0)
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static __inline__ void __list_add(struct list_head * new,
+    struct list_head * prev,
+    struct list_head * next)
+{
+    next->prev = new;
+    new->next = next;
+    new->prev = prev;
+    prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static __inline__ void list_add(struct list_head *new, struct list_head *head)
+{
+    __list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static __inline__ void list_add_tail(struct list_head *new, struct list_head *head)
+{
+    __list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static __inline__ void __list_del(struct list_head * prev,
+    struct list_head * next)
+{
+    next->prev = prev;
+    prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is in an undefined state.
+ */
+static __inline__ void list_del(struct list_head *entry)
+{
+    __list_del(entry->prev, entry->next);
+    entry->next = entry->prev = 0;
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static __inline__ void list_del_init(struct list_head *entry)
+{
+    __list_del(entry->prev, entry->next);
+    INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static __inline__ int list_empty(struct list_head *head)
+{
+    return head->next == head;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static __inline__ void list_splice(struct list_head *list, struct list_head *head)
+{
+    struct list_head *first = list->next;
+
+    if (first != list) {
+        struct list_head *last = list->prev;
+        struct list_head *at = head->next;
+
+        first->prev = head;
+        head->next = first;
+
+        last->next = at;
+        at->prev = last;
+    }
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr: the &struct list_head pointer.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+    ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+
+/**
+ * list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop counter.
+ * @head: the head for your list.
+ */
+#define list_for_each(pos, head) \
+    for (pos = (head)->next; pos != (head); \
+        pos = pos->next)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos: the &struct list_head to use as a loop counter.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+    for (pos = (head)->next, n = pos->next; pos != (head); \
+        pos = n, n = pos->next)
+
+
+
+#endif
diff -r 75c61490cc06 tools/vdisk/vdisk.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vdisk.h Thu Jun 21 13:05:48 2007 -0400
@@ -0,0 +1,215 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#ifndef __VDISK_H
+#define __VDISK_H
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <linux/limits.h>
+#include <syslog.h>
+#include <libaio.h>
+#include "list.h"
+#include "tapaio.h"
+
+// vdisk_tool's operations
+#define VDISK_OP_CREATE (1<<0)
+#define VDISK_OP_HEADERS (1<<1)
+#define VDISK_OP_DUMP (1<<2)
+#define VDISK_OP_MODIFY (1<<3)
+
+// Return codes
+#define VID_BLOCK_MAPPED (0)
+#define VID_BLOCK_NOTMAPPED (-1)
+#define VID_BLOCK_TOOBIG (-2)
+#define VID_BLOCK_MAPERR (-3)
+
+// IO operation codes
+#define VDISK_READ (0)
+#define VDISK_WRITE (1)
+
+// Async IO macros
+#define VDISK_HASH_SZ (2048)
+#define VDISK_HASH_IDX(x) ((x) & (VDISK_HASH_SZ-1))
+#define VDISK_INVALID_HASH (-1)
+#define REQUEST_ASYNC_FD (1) // Should really be defined in kernel
+
+#define SECTOR_SIZE (512)
+
+// vdisk device flags
+#define VDISK_SYNCIO_BUF (1<<0)
+#define VDISK_RO (1<<1)
+
+// vdisk file flags
+#define VDF_LEAF (1<<0) // last COW child (writeable)
+
+// Statistics gathering
+#define VDISK_STATS (0)
+#define VDISK_SYNCIO_STATS (0)
+
+#if VDISK_STATS
+#define DO_STATS(x) x
+#else
+#define DO_STATS(x)
+#endif
+
+
+
+
+// Datatype for addressing host memory
+#if defined __x86_64__
+typedef uint64_t addr_t;
+#else
+typedef uint32_t addr_t;
+#endif
+
+typedef int file_t;
+
+// Forward declaration
+struct vdisk_dev;
+
+// Stores info about a pending async IO
+typedef struct pending_aio {
+    uint32_t block;
+    uint32_t num_blocks;
+    void *arg;
+    void *aiocb;
+    off_t off;
+    file_t fd;
+    int op;
+    int res;
+} pending_aio_t;
+
+// Hash that stores async IO data
+typedef struct vdisk_hash {
+    uint64_t key;
+    struct iocb io;
+    pending_aio_t pio;
+} vdisk_hash_t;
+
+// run data to allow coalescing of writes when doing posix_fadvise() sync/flush
+typedef struct vdisk_syncio {
+    int is_set;
+    off_t io_start;
+    off_t io_len;
+#if VDISK_SYNCIO_STATS
+    unsigned long total_writes;
+    unsigned long contig_writes;
+    unsigned long flush_size_sub1MB;
+    unsigned long flush_size_sub2MB;
+    unsigned long flush_size_sub4MB;
+    unsigned long flush_size_sub8MB;
+    unsigned long flush_size_ovr8MB;
+    unsigned long flush_size_force;
+    time_t last_dbg_print;
+#endif
+} vdisk_syncio_t;
+
+// Per-file structure
+typedef struct vd_file {
+    struct list_head vdf_list;
+    char name[PATH_MAX];
+    file_t fd;
+    int flags;
+    int batch_sz; // number of blocks that are mapped sequentially
+    void *vdf; // format-specific data
+    vdisk_syncio_t *syncio; // allows sync io to buffer in pagecache for
+                            // better io performance
+} vd_file_t;
+
+// Data describing format's properties (ops etc.)
+typedef struct vdf_data {
+    char ftype[8]; // File name extension
+
+    int (*open)(struct vdisk_dev *vdisk, char *filename);
+    void (*close)(struct vdisk_dev *vdisk);
+    int (*map_block)(vd_file_t *vf, uint32_t *blockno, int num_blocks,
+                     int op, void **arg);
+    int (*xfer_commit)(void *arg, int err);
+    int (*print_header)(vd_file_t *vf);
+    int (*parse_args)(int argc, int operations, char *argv[], void **optp);
+    int (*create_vdisk)(char *filename, void *optp);
+    int (*modify_vdisk)(struct vdisk_dev *vdisk, void *optp);
+    struct list_head vdfd_list; // connects to global format list
+} vdf_data_t;
+
+// Top-level datastructure
+typedef struct vdisk_dev {
+
+    struct vdisk_geom {
+        int cyls;
+        int heads;
+        int secs;
+    } geom;
+
+    ssize_t sz; // Device size (bytes)
+
+    int flags;
+
+    // head of vdisk files (vd_file_t) list
+    struct list_head vdf_head;
+
+    vdf_data_t *vdfd;
+
+    // AIO data
+    vdisk_hash_t hash[VDISK_HASH_SZ];
+    struct iocb *aio_submit[VDISK_HASH_SZ];
+    struct io_event aio_events[VDISK_HASH_SZ];
+    tap_aio_context_t aio_ctx;
+    int use_aio;
+    int aio_fd;
+    int aio_cnt;
+
+    // Stats
+    uint64_t busyio;
+    uint64_t syncio;
+    uint64_t asyncio;
+    uint64_t tot_io;
+} vdisk_dev_t;
+
+struct program_props {
+    void *alloc_func;
+    void *free_func;
+    int out_target;
+};
+
+
+#define VDISK_OUT_STDERR (0)
+#define VDISK_OUT_SYSLOG (1)
+extern int vdisk_dbg_level;
+extern int vdisk_out_target;
+#define VIDDBG(n, fmt, args...) vdisk_log_error(n, __FILE__, __LINE__, fmt, ##args)
+
+// Log the failed expression and abort(). The macro is a single void
+// expression with no trailing ';' so that "if (x) ASSERT(y); else ..."
+// parses as one statement.
+#define ASSERT(expr) \
+    ((expr) ? (void)0 : \
+    ({ \
+        VIDDBG(0, "Assertion failed: %s\n", __STRING(expr)); \
+        abort(); \
+    }))
+
+extern int vdisk_pagesz; //4K
+
+extern void vdisk_log_error(int level, char *file, int line, char *fmt, ...);
+extern int vdf_read_state(vdisk_dev_t *vdisk, char *filename);
+extern int vdf_print_headers(vdisk_dev_t *vdisk, char *filename);
+extern int vdisk_register (vdf_data_t *vdfd);
+extern void vdisk_unregister (vdf_data_t *vdfd);
+extern int vdf_init(vdisk_dev_t *vdisk, char *fname);
+extern int vdisk_common_init(vdisk_dev_t *vdisk);
+extern int vdf_find_vdfd(vdisk_dev_t *vdisk, char *ftype);
+extern int vdisk_xfer_cb(vdisk_dev_t *vdisk, struct pending_aio *pio);
+extern int vdisk_rw(void *hdl, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors, int write, void *aiocb);
+extern void vdisk_alloc_init(void *alloc_func, void *free_func);
+extern int vdisk_init(vdisk_dev_t *vdisk, char *filename,
+                      struct program_props *props, uint8_t flags);
+extern void vdisk_fini(vdisk_dev_t *vdisk);
+
+#endif /* __VDISK_H */
diff -r 75c61490cc06 tools/vdisk/vdisk_common.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vdisk_common.c Thu Jun 21 13:05:53 2007 -0400
@@ -0,0 +1,616 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dlfcn.h>
+
+#include "vdisk.h"
+#include "vdisk_utils.h"
+
+
+static int vdisk_initialized = 0;
+int vdisk_pagesz = 0;
+
+// Release per-file sync-IO state and close the device through the
+// format's close() hook. Safe to call on an already-closed device.
+void vdisk_fini(vdisk_dev_t *vdisk)
+{
+    struct list_head *ptr;
+    vd_file_t *vdf;
+
+    // We may have already closed the device
+    if ((vdisk == NULL) || (vdisk->vdfd == NULL) ||
+        (vdisk->vdfd->close == NULL))
+        return;
+
+    list_for_each(ptr, &vdisk->vdf_head) {
+        vdf = list_entry(ptr, vd_file_t, vdf_list);
+        free(vdf->syncio);
+        vdf->syncio = NULL;
+    }
+
+    vdisk->vdfd->close(vdisk);
+}
+
+// Initialize 'vdisk' from 'filename'. Anything up to and including the
+// first ':' in 'filename' is skipped. 'props' (may be NULL) selects the
+// log target and allocator. Returns 0 on success, non-zero on error.
+int vdisk_init(vdisk_dev_t *vdisk, char *filename,
+               struct program_props *props, uint8_t flags)
+{
+    int err;
+    char *fname;
+
+    if ((vdisk == NULL) || (filename == NULL))
+        return (EINVAL);
+
+    vdisk_common_init(NULL/*XXX: ?? */);
+
+    if (props != NULL) {
+        // Set where output is directed
+        vdisk_out_target = props->out_target;
+        vdisk_alloc_init(props->alloc_func, props->free_func);
+    } else {
+        vdisk_out_target = VDISK_OUT_STDERR;
+        vdisk_alloc_init(NULL, NULL);
+    }
+
+    fname = strchr(filename, ':');
+    if (fname == NULL)
+        fname = filename;
+    else
+        fname++;
+
+    vdisk->flags = flags;
+
+    err = vdf_init(vdisk, fname);
+    if (err != 0) {
+        VIDDBG(0, "Can't initialize format's data for %s\n",
+               filename);
+        return (err);
+    }
+
+    return (0);
+}
+
+// Pick the format from the extension of 'fname', read the on-disk state
+// and, when VDISK_SYNCIO_BUF is set, allocate per-file sync-IO run data.
+// Returns 0 on success, EINVAL / format error / -1 on failure.
+int
+vdf_init(vdisk_dev_t *vdisk, char *fname)
+{
+    char *ext;
+    int err;
+    struct list_head *ptr;
+    vd_file_t *vdf;
+
+    ext = strrchr(fname, '.');
+    if (ext == NULL) {
+        VIDDBG(0, "Can't determine file type for %s\n", fname);
+        return (EINVAL);
+    }
+
+    ext++; // Skip '.'
+
+    err = vdf_find_vdfd(vdisk, ext);
+    if (err) {
+        VIDDBG(0, "Can't find format's data\n");
+        return (err);
+    }
+
+    err = vdf_read_state(vdisk, fname);
+    if (err) {
+        VIDDBG(0, "failed to read headers\n");
+        return (-1);
+    }
+
+    if (vdisk->flags & VDISK_SYNCIO_BUF) {
+        list_for_each(ptr, &vdisk->vdf_head) {
+            vdf = list_entry(ptr, vd_file_t, vdf_list);
+            vdf->syncio = calloc( 1, sizeof(vdisk_syncio_t));
+            if (!vdf->syncio) {
+                // Not fatal: IO simply proceeds without coalescing
+                VIDDBG(0, "vdisk_alloc_syncio_run_data() "
+                       "failed '%s', thus no speed up\n",
+                       strerror(errno));
+            }
+        }
+    }
+
+    return (0);
+}
+
+// Ask each file of the device in list order to map a single block and
+// stop at the first one that reports VID_BLOCK_MAPPED. '*vf' is set to
+// the last file tried. Returns the last map_block() result.
+int
+vdisk_map_block(struct vdisk_dev *dev,
+                uint32_t *blockno, /* IN/OUT */
+                int op,
+                vd_file_t **vf,
+                void **arg)
+{
+    struct list_head *ptr;
+    vd_file_t *vdf;
+    int res = VID_BLOCK_NOTMAPPED;
+
+    list_for_each(ptr, &dev->vdf_head) {
+
+        *vf = vdf = list_entry(ptr, vd_file_t, vdf_list);
+
+        res = dev->vdfd->map_block(vdf, blockno, 1, op, arg);
+        if (res == VID_BLOCK_MAPPED)
+            return (res);
+    }
+
+    if (op == VDISK_WRITE)
+        VIDDBG(0, "Couldn't map block %d\n", *blockno);
+
+    return (res);
+}
+
+// Reset the file list, set up the AIO queue when AIO is requested
+// (falling back to sync IO if that fails) and open the vdisk through the
+// format's open() hook. Returns 0 or the open() error.
+int
+vdf_read_state(vdisk_dev_t *vdisk, char *filename)
+{
+    int err;
+    int i;
+
+    INIT_LIST_HEAD(&vdisk->vdf_head);
+
+    if (vdisk->use_aio) {
+        for (i=0;i<VDISK_HASH_SZ;i++)
+            vdisk->hash[i].key = VDISK_INVALID_HASH;
+
+        memset(&vdisk->aio_ctx.aio_ctx, 0, sizeof(io_context_t));
+        err = io_queue_init(100, &vdisk->aio_ctx.aio_ctx);
+        if (err) {
+            VIDDBG(0, "io_queue_init() failed: %s. "
+                   " Async IO will not be available\n",
+                   strerror(-1*err));
+            vdisk->use_aio = 0;
+        }
+    }
+
+    err = vdisk->vdfd->open(vdisk, filename);
+    if (err) {
+        VIDDBG(0, "Problems opening vdisk %s (error %d)\n",
+               filename, err);
+        return (err);
+    }
+    return (0);
+}
+
+// Re-read the vdisk state and print the header of the first file in the
+// list. Returns 0 on success, non-zero on error.
+int
+vdf_print_headers(vdisk_dev_t *vdisk, char *filename)
+{
+    int err;
+    vd_file_t *vf;
+
+    err = vdf_read_state(vdisk, filename);
+    if (err) {
+        VIDDBG(0, "Failed to read state for %s\n", filename);
+        return (err);
+    }
+
+    // list_entry() on an empty list would yield a bogus vd_file_t
+    if (list_empty(&vdisk->vdf_head)) {
+        VIDDBG(0, "No files opened for %s\n", filename);
+        return (EINVAL);
+    }
+
+    vf = list_entry(vdisk->vdf_head.next, vd_file_t, vdf_list);
+    (void)vdisk->vdfd->print_header(vf);
+
+    return (0);
+}
+
+// Completion handler for one async IO: commit the transfer, fsync writes,
+// drop the affected pages from the page cache and free the hash slots of
+// the request. Returns the posix_fadvise() result (0 on success).
+int
+vdisk_xfer_cb(vdisk_dev_t *vdisk, struct pending_aio *pio)
+{
+    uint32_t blk;
+    int err = 0;
+
+    ASSERT(pio != NULL);
+
+    err = vdisk->vdfd->xfer_commit(pio->arg, pio->res);
+    if (err)
+        VIDDBG(0, "Failed to commit transfer (error %d)\n", err);
+
+    if (pio->op == VDISK_WRITE) {
+        err = fsync(pio->fd);
+        if (err)
+            VIDDBG(0, "fsync: %s\n", strerror(errno));
+    }
+
+    /*
+     * posix_fadvise() (or, rather, kernel's sys_fadvise64_64())
+     * invalidates whole pages only.
+     */
+    err = posix_fadvise(pio->fd, (pio->off & (~((off_t)vdisk_pagesz-1))),
+                        ((off_t)pio->num_blocks << 9) + (off_t)vdisk_pagesz,
+                        POSIX_FADV_DONTNEED);
+    // posix_fadvise() returns the error number itself; errno is not set
+    if (err)
+        VIDDBG(0, "posix_fadvise: %s\n", strerror(err));
+
+
+    for (blk=pio->block; blk < (pio->block + pio->num_blocks); blk++)
+        vdisk->hash[VDISK_HASH_IDX(blk)].key = VDISK_INVALID_HASH;
+
+    return (err);
+}
+
+// Give back 'count' hash slots, starting at block 'first', that the
+// caller reserved earlier in vdisk_rw().
+static void
+vdisk_release_hash(struct vdisk_dev *vdisk, int64_t first, int count)
+{
+    uint32_t blk = (uint32_t)first;
+    int j;
+
+    for (j = 0; j < count; j++, blk++) {
+        int idx = VDISK_HASH_IDX(blk);
+
+        ASSERT(vdisk->hash[idx].key == blk);
+        vdisk->hash[idx].key = VDISK_INVALID_HASH;
+    }
+}
+
+// Read or write 'nb_blocks' 512-byte blocks starting at 'block'.
+//  hdl    - the vdisk_dev_t to operate on
+//  buf    - caller's buffer; bounced through an aligned copy if it is
+//           not 512-byte aligned (which also forces sync IO)
+//  op     - VDISK_READ or VDISK_WRITE
+//  aiocb  - opaque cookie handed to the async IO layer
+// Returns the number of bytes the caller need not wait for (see the
+// comment before the final return), 0 when everything went out as AIO,
+// or a negative errno (-ENOSPC, -EBUSY, -ENOMEM, -EINVAL).
+// NOTE(review): a map failure returns -1*res, which is positive for the
+// negative VID_BLOCK_* codes -- confirm what callers expect.
+int vdisk_rw(void *hdl, int64_t block,
+             uint8_t *buf, int nb_blocks,
+             int op, void *aiocb)
+{
+    off_t offset;
+    unsigned long bytes;
+    uint32_t real_block, blk;
+    vd_file_t *vdf = NULL;
+    void *arg = NULL;
+    struct vdisk_dev *vdisk = (struct vdisk_dev *)hdl;
+    int i;
+    struct list_head *ptr;
+    int res = 0;
+    char *b = (char *)buf;
+    char *pool = NULL;
+    int batch;
+    int use_aio = vdisk->use_aio;
+    int reserved = 0; // non-zero while we own hash slots for this request
+    int busy = 0;
+    int hash_index;
+    int zero_blocks = 0;
+
+    VIDDBG(50, "block=0x%" PRIx64 ", nb_blocks=%d\n",
+           block, nb_blocks);
+
+    if (((block + (nb_blocks-1)) << 9) >= vdisk->sz) {
+        return (-ENOSPC);
+    }
+
+    vdisk->tot_io++;
+
+    if (use_aio) {
+        // Check whether the hash has available slots and reserve them
+        // We reserve them as we go because we want to make sure that
+        // the request fits in the hash.
+        for (i=0, blk=block; i<nb_blocks; i++, blk++) {
+            hash_index = VDISK_HASH_IDX(blk);
+            VIDDBG(50, "block=0x%" PRIx64 ", nb_blocks=%d i=%d "
+                   "blk=0x%x, vdisk->hash.key[%d]=0x%" PRIx64 "\n",
+                   block, nb_blocks, i,
+                   blk, hash_index,
+                   vdisk->hash[hash_index].key);
+            if (vdisk->hash[hash_index].key != VDISK_INVALID_HASH) {
+                vdisk->busyio++;
+                if (vdisk->hash[hash_index].key != blk)
+                    busy = 1;
+                use_aio = 0;
+                break;
+            }
+            vdisk->hash[hash_index].key = blk;
+            VIDDBG(50, "hash_index=%d, blk=%d\n",
+                   hash_index, blk);
+        }
+
+        if (!use_aio) {
+            // We need to free hash entries that we've just reserved:
+            // exactly the first 'i' blocks. Counting forward avoids
+            // the unsigned wrap-around of a "b >= block" countdown
+            // when block is 0.
+            VIDDBG(50, "Freeing hash for block %" PRId64 "\n",
+                   block);
+            vdisk_release_hash(vdisk, block, i);
+            VIDDBG(50, "Done\n");
+            if (busy) {
+                VIDDBG(50, "Busy\n");
+                return (-EBUSY);
+            }
+            vdisk->syncio++;
+        } else {
+            reserved = 1;
+        }
+    }
+
+    // We can only transfer to/from an aligned buffer
+    if ((addr_t)buf & 511) {
+        b = pool = vdisk_malloc((nb_blocks+1) * 512);
+        if (pool == NULL) {
+            VIDDBG(0, "Can't create buffer\n");
+            if (reserved)
+                vdisk_release_hash(vdisk, block, nb_blocks);
+            return (-ENOMEM);
+        }
+        while ((addr_t)b & 511) b++;
+        VIDDBG(10, "Aligned buffer %p (pool %p, b %p)\n", buf, pool, b);
+
+        // Falling back to sync IO: no completion callback will ever
+        // free the slots, so give them back now.
+        if (reserved) {
+            vdisk_release_hash(vdisk, block, nb_blocks);
+            reserved = 0;
+        }
+        use_aio = 0;
+    }
+
+    i = 0; // block in the buf[]
+    while (nb_blocks>0) {
+
+        // Find largest contiguous set of blocks that we
+        // we can access in a single IO.
+
+        batch = nb_blocks;
+    again:
+        arg = NULL;
+        list_for_each(ptr, &vdisk->vdf_head) {
+
+            vdf = list_entry(ptr, vd_file_t, vdf_list);
+
+            real_block = (uint32_t)block;
+
+            // Make batch fit into a single vdf->batch_sz
+            if ( ((block + batch - 1) & ~(vdf->batch_sz-1))
+                 != (block & ~(vdf->batch_sz-1)))
+                batch = ( (block + vdf->batch_sz) &
+                          ~(vdf->batch_sz-1) )
+                    - block;
+
+            // Map the requested block set to address in the file
+            res = vdisk->vdfd->map_block(vdf, &real_block,
+                                         batch, op, &arg);
+
+            if (res == VID_BLOCK_TOOBIG) {
+                // Some blocks are mapped and some are not.
+                // Need to try a smaller batch
+
+                batch >>= 1;
+                if (!batch) {
+                    // Free the slots of the blocks not yet issued
+                    if (reserved)
+                        vdisk_release_hash(vdisk, block,
+                                           nb_blocks);
+                    if (pool)
+                        vdisk_free(pool);
+
+                    VIDDBG(0, "Inconsistent mapping error\n");
+                    return (-EINVAL);
+                }
+                goto again;
+            }
+
+            if ((res != VID_BLOCK_NOTMAPPED) ||
+                ((vdf->flags & VDF_LEAF) && (op == VDISK_WRITE)))
+                break;
+        }
+
+        if (res != VID_BLOCK_MAPPED) {
+
+            // Unallocated blocks return zeroes for reads
+            if ((op == VDISK_READ) && (res == VID_BLOCK_NOTMAPPED)) {
+
+                // No IO is issued for this batch: free its slots
+                if (use_aio)
+                    vdisk_release_hash(vdisk, block, batch);
+
+                memset(&buf[i*512], 0, batch*512);
+                i += batch;
+                b += batch * 512;
+                block += batch;
+                nb_blocks -= batch;
+                zero_blocks += batch;
+                VIDDBG(10, "Skipping %d blocks\n", batch);
+                continue;
+            }
+
+            VIDDBG(0, "Couldn't map block %" PRId64 " (%d)\n",
+                   block, res);
+            if (reserved)
+                vdisk_release_hash(vdisk, block, nb_blocks);
+            if (pool)
+                vdisk_free(pool);
+            return (-1*res);
+        }
+
+        VIDDBG(50, "mapped sector %" PRId64 " to block %d for read\n",
+               block, real_block);
+
+        // Offset in the file
+        offset = (uint64_t)real_block << 9;
+
+        if (use_aio)
+            vdisk->asyncio++;
+
+        // Perform IO
+        if (op == VDISK_WRITE) {
+            if (pool)
+                memcpy(b, &buf[i*512], batch * 512);
+            if (!use_aio)
+                bytes = vdisk_syncio(vdf->fd, b, batch * 512,
+                                     offset, VDISK_WRITE, vdf->syncio);
+            else
+                bytes = vdisk_asyncio(vdisk, block, vdf->fd,
+                                      b, batch * 512, offset,
+                                      arg, aiocb, VDISK_WRITE);
+        } else /* VDISK_READ */ {
+            if (!use_aio) {
+                bytes = vdisk_syncio(vdf->fd, b, batch * 512,
+                                     offset, VDISK_READ, NULL);
+                if (pool)
+                    memcpy(&buf[i*512], b, batch * 512);
+            } else {
+                bytes = vdisk_asyncio(vdisk, block, vdf->fd,
+                                      b, batch * 512, offset,
+                                      arg, aiocb, VDISK_READ);
+            }
+        }
+
+        if (bytes != batch * 512) {
+            VIDDBG(0, "%s %ld bytes (block %d) instead of "
+                   "%d (%s)\n", (op==VDISK_WRITE)?"Wrote":"Read",
+                   bytes, real_block, batch * 512, vdf->name);
+            if ((signed long)bytes == -1)
+                res = errno;
+        }
+
+        if (!use_aio)
+            if (vdisk->vdfd->xfer_commit(arg, res))
+                VIDDBG(0, "Couldn't commit transfer\n");
+
+        i += batch;
+        b += batch * 512;
+        block += batch;
+        nb_blocks -= batch;
+    }
+
+    if (pool)
+        vdisk_free(pool);
+
+    /*
+     * Returning number of processed bytes to caller who requested AIO
+     * (vdisk->use_aio && aiocb) will tell him that there is no
+     * need to wait for AIO completion
+     * There are two cases when this happens:
+     * - We couldn't perform any AIOs (use_aio == 0)
+     * - Some requests have been reads to unallocated blocks (and
+     *   thus are read as zeroes). Note that if *some* blocks have been
+     *   sent as AIOs, the caller will need to wait for completions
+     *   (and we return zero).
+     */
+    if (!use_aio)
+        return (i * 512); // 'i' is number of accessed sectors;
+    else if (vdisk->use_aio && aiocb && (zero_blocks != 0))
+        return (zero_blocks * 512);
+    else
+        return (0);
+}
+
+LIST_HEAD(vdfd_head);
+
+// Register new file format. Returns -1 if 'new_vdfd' is already on the
+// global format list, 0 otherwise.
+int
+vdisk_register(vdf_data_t *new_vdfd)
+{
+    struct list_head *ptr;
+    vdf_data_t *vdfd;
+
+    list_for_each(ptr, &vdfd_head) {
+        vdfd = list_entry(ptr, vdf_data_t, vdfd_list);
+        if (vdfd == new_vdfd) {
+            return (-1);
+        }
+    }
+
+    list_add(&new_vdfd->vdfd_list, &vdfd_head);
+    VIDDBG(10, "Registered \"%s\" format\n", new_vdfd->ftype);
+    return (0);
+}
+
+// Unregister file format; a format that was never registered is ignored
+void
+vdisk_unregister(vdf_data_t *vdfd)
+{
+    struct list_head *ptr;
+
+    list_for_each(ptr, &vdfd_head) {
+        if (vdfd == list_entry(ptr, vdf_data_t, vdfd_list)) {
+            list_del(&vdfd->vdfd_list);
+            break;
+        }
+    }
+}
+
+// Find format-specific library ("libvdisk_<name>.so"), load it and call
+// its init routine ("<name>_init"). 'name' usually comes from a file name
+// extension, so both strings are built with bounded formatting.
+// Returns 0 on success, -1 on failure.
+int
+vdisk_init_format(char *name)
+{
+    void *handle;
+    char libname[64];
+    char initfunc[32];
+    void (*init)(void) = NULL;
+    char *err;
+    int n;
+
+    if (name == NULL)
+        return (-1);
+
+    // Construct library name
+    n = snprintf(libname, sizeof(libname), "libvdisk_%s.so", name);
+    if ((n < 0) || ((size_t)n >= sizeof(libname))) {
+        VIDDBG(0, "Format name too long: %s\n", name);
+        return (-1);
+    }
+
+    // Construct init function name
+    n = snprintf(initfunc, sizeof(initfunc), "%s_init", name);
+    if ((n < 0) || ((size_t)n >= sizeof(initfunc))) {
+        VIDDBG(0, "Format name too long: %s\n", name);
+        return (-1);
+    }
+
+    handle = dlopen (libname, RTLD_LAZY);
+    if (!handle) {
+        VIDDBG(0, "%s\n", dlerror());
+        return (-1);
+    }
+
+    dlerror(); // Clear any existing error
+
+    *(void **) (&init) = dlsym(handle, initfunc);
+    if (((err = dlerror()) != NULL) || (init == NULL)) {
+        VIDDBG(0, "%s\n", err ? err : "init routine not found");
+        dlclose(handle); // don't leak the library on failure
+        return (-1);
+    }
+
+    // Call format-specific init routine. The handle stays open on
+    // purpose: the library's code is used from now on.
+    (*init)();
+
+    return (0);
+}
+
+// Point vdisk->vdfd at the registered format whose ftype equals 'ftype'.
+// If none is registered yet, try once to load the format library and
+// search again. Returns 0 on success, EINVAL or the loader error.
+int
+vdf_find_vdfd(vdisk_dev_t *vdisk, char *ftype)
+{
+    struct list_head *ptr;
+    vdf_data_t *vdfd;
+    int err;
+    int attempt = 0;
+
+    while (attempt < 2) {
+        list_for_each(ptr, &vdfd_head) {
+
+            vdfd = list_entry(ptr, vdf_data_t, vdfd_list);
+
+            if (!strcmp(vdfd->ftype, ftype)) {
+
+                vdisk->vdfd = vdfd;
+                return (0);
+            }
+        }
+
+        if (attempt) {
+            VIDDBG(0, "Unknown format %s\n", ftype);
+            return (EINVAL);
+        }
+
+        // Didn't find vdfd for this extension, maybe we need
+        // to initialize it and try again.
+        err = vdisk_init_format(ftype);
+        if (err != 0) {
+            VIDDBG(0, "Can't initialize format %s\n", ftype);
+            return (err);
+        }
+        attempt++;
+    }
+
+    /*NOTREACHED*/
+    return (EINVAL);
+}
+
+// One-time library setup: reset the global format list and cache the
+// page size. The 'vdisk' argument is unused. Always returns 0.
+int
+vdisk_common_init(vdisk_dev_t *vdisk)
+{
+    if (vdisk_initialized)
+        return (0);
+
+    INIT_LIST_HEAD(&vdfd_head);
+
+    vdisk_pagesz = getpagesize();
+
+    vdisk_initialized = 1;
+
+    return (0);
+}
diff -r 75c61490cc06 tools/vdisk/vdisk_tool.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vdisk_tool.c Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,338 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. This file and the modifications can be redistributed and/or
+// modified under the terms and conditions of the GNU General Public
+// License, version 2.1 and not any later version of the GPL, as published
+// by the Free Software Foundation.
+
+#define _GNU_SOURCE // for strdup()
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include <limits.h>
+#include <getopt.h>
+
+#include "vdisk.h"
+
+extern int vdisk_init_format(char *);
+static char *supported_formats[] = {"vhd", NULL};
+
+// Initialize the vdisk library and load every supported format.
+// Returns 0 on success or the first error encountered.
+int
+init_tool()
+{
+    int err;
+    int i;
+
+    err = vdisk_common_init(NULL/*XXX: ?? */);
+    if (err) {
+        VIDDBG(0, "Failed to initialize vdisk\n");
+        return (err);
+    }
+
+    for (i=0; ;i++) {
+        if (supported_formats[i] == NULL)
+            break;
+
+        err = vdisk_init_format(supported_formats[i]);
+        if (err) {
+            VIDDBG(0, "Failed to initialize %s format\n",
+                   supported_formats[i]);
+            return (err);
+        }
+    }
+    return (0);
+}
+
+// Print command-line help and the list of supported formats to stderr
+static void
+print_usage(char *prog)
+{
+    int i;
+
+    fprintf(stderr, "Usage: %s OPTIONS -# <format-specific options> "
+            "<filename>\n", prog);
+    fprintf(stderr,
+            " OPTIONS:\n"
+            " [-f <format>] [-C] [-H] [-M] "
+            "[-D <block> [-b <num_blocks>] [-o outfile]]\n"
+            " -C Create a vdisk\n"
+            " -H Read vdisk headers from file\n"
+            " -M Modify a vdisk\n"
+            " -D Dump a vhd\n"
+            " block first block to read (required)\n"
+            " num_blocks number of blocks to read. If not\n"
+            " specified, whole file will be read\n"
+            " outfile output file. If not specified,\n"
+            " stdout is used\n"
+            " Supported formats: ");
+    for (i=0; ;i++) {
+        if (supported_formats[i] == NULL) {
+            fprintf(stderr, "\n");
+            break;
+        }
+        fprintf(stderr, "%s ", supported_formats[i]);
+    }
+}
+
+// Parse a non-negative decimal block number/count that fits in an int.
+// Returns 0 and stores the value in '*out', or -1 on malformed input.
+static int
+parse_block_arg(const char *s, int *out)
+{
+    char *end;
+    long v;
+
+    errno = 0;
+    v = strtol(s, &end, 10);
+    if ((end == s) || (*end != '\0') || (errno == ERANGE) ||
+        (v < 0) || (v > INT_MAX))
+        return (-1);
+
+    *out = (int)v;
+    return (0);
+}
+
+// Write exactly 'len' bytes of 'buf' to 'fd', retrying short writes and
+// EINTR. Returns 0 on success, -1 (errno set by write()) on failure.
+static int
+write_all(int fd, const uint8_t *buf, size_t len)
+{
+    size_t done = 0;
+
+    while (done < len) {
+        ssize_t w = write(fd, buf + done, len - done);
+
+        if (w == -1) {
+            if (errno == EINTR)
+                continue;
+            return (-1);
+        }
+        done += (size_t)w;
+    }
+    return (0);
+}
+
+// vdisk_tool entry point. The last argument is the vdisk file name; see
+// print_usage() for the options. Returns 0 on success, an errno-style
+// value on usage/format errors, and exits with 1 on dump IO failures.
+int
+main(int argc, char *argv[])
+{
+    char filename[PATH_MAX];
+    char *outfile = NULL;
+    char format[16] = "vhd";
+    int operations = 0;
+    int c = 0; // getopt() returns int; a char can't hold -1 portably
+    extern char *optarg;
+    extern int optind, opterr, optopt;
+    vdisk_dev_t vdisk;
+    int err;
+    void *optp = NULL; // Format-specific options
+    char *file_fmt;
+    int i;
+    int n;
+    int first_block = 0, num_blocks = -1;
+    struct program_props props;
+    uint8_t flags;
+
+    //init_tool();
+
+    if (argc < 2) {
+        print_usage(argv[0]);
+        return (EINVAL);
+    }
+
+    // Start from a known state: use_aio, the file list and the stats
+    // are read by the library before anything else sets them.
+    memset(&vdisk, 0, sizeof(vdisk));
+
+    /*
+     * Read the filename argument first -- we may need
+     * it to determine format
+     */
+    n = snprintf(filename, sizeof(filename), "%s", argv[argc-1]);
+    if ((n < 0) || ((size_t)n >= sizeof(filename))) {
+        VIDDBG(0, "File name too long\n");
+        return (ENAMETOOLONG);
+    }
+    file_fmt = strrchr(filename, '.');
+
+    // See whether what we think is file's format is supported
+    if (file_fmt) {
+        file_fmt++; // Skip '.'
+        for (i=0; ;i++) {
+            if (supported_formats[i] == NULL) {
+                // Not a supported format, ignore suffix
+                file_fmt = NULL;
+                break;
+            }
+
+            if (!strcmp(file_fmt, supported_formats[i]))
+                break; // Found it
+        }
+    }
+
+    vdisk.vdfd = NULL;
+
+    while (c != '#') {
+
+        c = getopt(argc, argv, "f:CHMD:b:o:#");
+        if (c == -1)
+            break;
+
+        switch (c) {
+        case 'f':
+            n = snprintf(format, sizeof(format), "%s", optarg);
+            if ((n < 0) || ((size_t)n >= sizeof(format))) {
+                VIDDBG(0, "Format name too long\n");
+                return (EINVAL);
+            }
+
+            /*
+             * If we either couldn't determine format from filename
+             * argument or we thought we could but '-f' specifies
+             * different format, we append appropriate suffix
+             */
+            if (!file_fmt || strcmp(format, file_fmt)) {
+                size_t len = strlen(filename);
+
+                n = snprintf(filename + len,
+                             sizeof(filename) - len,
+                             ".%s", format);
+                if ((n < 0) ||
+                    ((size_t)n >= sizeof(filename) - len)) {
+                    VIDDBG(0, "File name too long\n");
+                    return (ENAMETOOLONG);
+                }
+                file_fmt = format;
+            }
+
+            break;
+        case 'C':
+            operations |= VDISK_OP_CREATE;
+            break;
+        case 'H':
+            /* File to read headers from */
+            operations |= VDISK_OP_HEADERS;
+            break;
+        case 'M':
+            /* Modify the vdisk */
+            operations |= VDISK_OP_MODIFY;
+            break;
+        case 'D':
+            if (parse_block_arg(optarg, &first_block) != 0) {
+                VIDDBG(0, "First block must be a "
+                       "non-negative number\n");
+                exit(1);
+            }
+            operations |= VDISK_OP_DUMP;
+            break;
+        case 'b':
+            if (parse_block_arg(optarg, &num_blocks) != 0) {
+                VIDDBG(0, "Number of blocks must be a "
+                       "non-negative number\n");
+                exit(1);
+            }
+            break;
+        case 'o':
+            // Don't confuse vdisk with output file
+            if (optarg == argv[argc-1]) {
+                print_usage(argv[0]);
+                exit(1);
+            }
+            free(outfile); // in case -o is given more than once
+            outfile = strdup(optarg);
+            if (outfile == NULL) {
+                VIDDBG(0, "Out of memory\n");
+                exit(1);
+            }
+            // Must not fall into '#': format-specific parsing only
+            // starts once the '-#' separator is seen.
+            break;
+
+        case '#':
+
+            if (file_fmt) {
+                err = vdf_find_vdfd(&vdisk, file_fmt);
+                if (err) {
+                    VIDDBG(0, "Fail to initialize "
+                           "format data for %s\n",
+                           format);
+                    return (err);
+                }
+            } else {
+                VIDDBG(0, "Unspecified or unsupported format\n");
+                print_usage(argv[0]);
+                return (EINVAL);
+            }
+
+            if (vdisk.vdfd->parse_args(argc, operations,
+                                       argv, &optp) != 0) {
+                print_usage(argv[0]);
+                return (EINVAL);
+            }
+
+            break;
+        default:
+            print_usage(argv[0]);
+            return (EINVAL);
+        }
+    }
+
+    /*
+     * At least one operation type is needed and
+     * filename needs to be specified
+     */
+    if (!operations || !file_fmt) {
+        print_usage(argv[0]);
+        return (EINVAL);
+    }
+
+    // XXX: We probably should have initialized by now
+    if (vdisk.vdfd == NULL) {
+        err = vdf_find_vdfd(&vdisk, file_fmt);
+        if (err) {
+            VIDDBG(0, "Fail to initialize format data for %s\n",
+                   format);
+            return (err);
+        }
+    }
+
+
+    // First create file, if requested
+    if (operations & VDISK_OP_CREATE) {
+        err = vdisk.vdfd->create_vdisk(filename, optp);
+        if (err) {
+            VIDDBG(0, "Can't create file\n");
+            return (err);
+        }
+    }
+
+    props.alloc_func = NULL;
+    props.free_func = NULL;
+    props.out_target = VDISK_OUT_STDERR;
+
+    if (!(operations & VDISK_OP_CREATE) &&
+        !(operations & VDISK_OP_MODIFY))
+        flags = VDISK_RO;
+    else
+        flags = 0;
+
+    err = vdisk_init(&vdisk, filename, &props, flags);
+    if (err) {
+        VIDDBG(0, "Fail to initialize from file %s\n",
+               filename);
+        return (err);
+    }
+
+    if (operations & VDISK_OP_HEADERS) {
+        err = vdf_print_headers(&vdisk, filename);
+        if (err) {
+            VIDDBG(0, "Can't read headers\n");
+            return (err);
+        }
+    }
+
+    if (operations & VDISK_OP_MODIFY) {
+        err = vdisk.vdfd->modify_vdisk(&vdisk, optp);
+        if (err) {
+            VIDDBG(0, "Can't modify headers\n");
+            return (err);
+        }
+    }
+
+    if (operations & VDISK_OP_DUMP) {
+        uint8_t *buf, *p;
+        int bytes;
+        int chunk_log = 21; // 2MB
+        int nblocks;
+        int fd;
+
+        // Open output file (use stdout if not specified)
+        if (outfile != NULL) {
+            fd = open(outfile, O_RDWR|O_CREAT,
+                      S_IRUSR|S_IWUSR);
+            if (fd == -1) {
+                VIDDBG(0, "Can't open %s: %s\n",
+                       outfile, strerror(errno));
+                exit(1);
+            }
+        } else
+            fd = 1; // stdout
+
+        // Allocate 512b-aligned read buffer
+        p = malloc((1<<chunk_log) + 512);
+        while (p == NULL) { // Try smaller chunks if we fail
+            // Never go below one 512-byte block: nblocks would
+            // become 0 and the dump loop would not advance.
+            if (chunk_log <= 9) {
+                VIDDBG(0, "Can't allocate buffer\n");
+                exit(1);
+            }
+            chunk_log--;
+            p = malloc((1<<chunk_log) + 512);
+        }
+        buf = p;
+        while ((addr_t)buf & 511) buf++;
+
+        // nblocks per transfer
+        nblocks = (1<<chunk_log) >> 9;
+
+        // If number of blocks to read is not specified,
+        // read whole vdisk
+        if (num_blocks < 0)
+            num_blocks = vdisk.sz >> 9;
+
+        for (i=0; i<num_blocks; i+=nblocks) {
+
+            // This could happen on last iteration
+            if ((i+nblocks) > num_blocks)
+                nblocks = num_blocks - i;
+
+            bytes = vdisk_rw(&vdisk, first_block+i, buf, nblocks,
+                             VDISK_READ, NULL);
+            if (bytes != (nblocks << 9)) {
+                VIDDBG(0, "vdisk_rw() returned %d\n", bytes);
+                exit(1);
+            }
+
+            // write() may be short (pipes, full disks); push it all
+            if (write_all(fd, buf, (size_t)nblocks << 9) == -1) {
+                VIDDBG(0, "write: %s\n", strerror(errno));
+                exit(1);
+            }
+        }
+
+        free(p);
+
+        // Only close what we opened; a failed close can mean lost data
+        if ((outfile != NULL) && (close(fd) == -1)) {
+            VIDDBG(0, "close: %s\n", strerror(errno));
+            exit(1);
+        }
+    }
+
+    free(outfile);
+    return 0;
+}
diff -r 75c61490cc06 tools/vdisk/vdisk_utils.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vdisk/vdisk_utils.c Thu Jun 21 13:05:31 2007 -0400
@@ -0,0 +1,435 @@
+// Copyright (c) 2003-2007, Virtual Iron Software, Inc.
+//
+// Portions have been modified by Virtual Iron Software, Inc.
+// (c) 2007. 
This file and the modifications can be redistributed and/or +// modified under the terms and conditions of the GNU General Public +// License, version 2.1 and not any later version of the GPL, as published +// by the Free Software Foundation. + +#define _GNU_SOURCE // for O_DIRECT +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <string.h> +#include <stdarg.h> +#include <libaio.h> +#include <time.h> +#include <limits.h> + +#include "list.h" +#include "vdisk.h" +#include "vdisk_utils.h" + +#define VDISK_MAX_ERRORS (100) +#define VDISK_ERR_STRING_LEN (512) + +static void *(*vdisk_alloc_func)(size_t sz) = malloc; +static void (*vdisk_free_func)(void *ptr) = free; + +int vdisk_dbg_level = 1; +int vdisk_out_target = VDISK_OUT_STDERR; // where to print messages + +// Don't want to keep this on stack +static char argstring[VDISK_ERR_STRING_LEN]; + +// Data structure to help with message throttling +struct vdisk_log_mgt { + int vdisk_err_cnt; + int interval; + int restart; + time_t last_error; + time_t next_check; +}; +static struct vdisk_log_mgt vdisk_log = { + .vdisk_err_cnt = 0, + .restart = 0, + .interval = 0, + .last_error = (time_t)0, + .next_check = (time_t)LONG_MAX, +}; + +// Print the message to either syslog or stderr, optionally +// specifying filename and line number +static void +vdisk_print_msg(char *file, int line, char *msg) +{ + if (file) { + if (vdisk_out_target == VDISK_OUT_SYSLOG) { + syslog(LOG_DEBUG, "%s:%d: %s", + file, line, msg); + } else { + fprintf(stderr, "%s:%d: %s", + file, line, msg); + } + } else { + if (vdisk_out_target == VDISK_OUT_SYSLOG) { + syslog(LOG_DEBUG, "%s", msg); + } else { + fprintf(stderr, "%s", msg); + } + } +} + +void +vdisk_log_error(int level, char *file, int line, char *fmt, ...) +{ + int print_msg; + + if (level > vdisk_dbg_level) + return; + + print_msg = 0; + + // Decide whether to print the message. 
+ // Only manage message reporting for level 0, which is + // usually reserved for errors. Other messages will be + // printed unconditionally. + if (level == 0) { + time_t now; + + if (time(&now) == (time_t)-1) { + // This should never happen ;-() + vdisk_print_msg(NULL, 0, "vdisk: Can't get time, " + "error reporting stopped\n"); + return; // XXX: Or continue? + } + + if (now >= vdisk_log.next_check) { + + if (now - vdisk_log.last_error > + (time_t)vdisk_log.interval) { + // reset message throttling + vdisk_log.restart = 0; + vdisk_log.interval = 0; + vdisk_log.vdisk_err_cnt = 0; + vdisk_log.next_check = LONG_MAX; + vdisk_print_msg(NULL, 0, "vdisk: Restoring " + "error reporting\n"); + } + + if (vdisk_log.restart) { + // Double the interval, max at 128 seconds + vdisk_log.interval = (vdisk_log.interval > 64) ? + vdisk_log.interval : + (vdisk_log.interval * 2); + vdisk_log.next_check += + (time_t)vdisk_log.interval; + vdisk_log.restart = 0; + } + + vdisk_log.vdisk_err_cnt = 0; + + } else { + // Message received during throttling interval. 
+ // We will need to double the interval later + vdisk_log.restart = 1; + } + + if (vdisk_log.vdisk_err_cnt < VDISK_MAX_ERRORS) { + vdisk_log.vdisk_err_cnt++; + print_msg = 1; + } + + if (vdisk_log.vdisk_err_cnt == VDISK_MAX_ERRORS) { + vdisk_log.vdisk_err_cnt++; + if (vdisk_log.interval == 0) { + // Start interval management + vdisk_print_msg(NULL, 0, "vdisk: Too many " + "errors, slowing down rate " + "of reporting\n"); + vdisk_log.interval = 1; + vdisk_log.next_check = now + + (time_t)vdisk_log.interval; + } + } + + vdisk_log.last_error = now; + + } else + print_msg = 1; + + + if (print_msg) { + va_list args; + + // Roll arguments into a string + va_start(args, fmt); + (void)vsnprintf(argstring, VDISK_ERR_STRING_LEN, + fmt, args); + va_end(args); + + vdisk_print_msg(file, line, argstring); + } +} + +void +vdisk_alloc_init(void *alloc_func, void *free_func) +{ + if (alloc_func != NULL) + vdisk_alloc_func = alloc_func; + + if (free_func != NULL) + vdisk_free_func = free_func; +} + +void * +vdisk_malloc(size_t sz) +{ + void *ptr; + + ptr = vdisk_alloc_func(sz); + if (ptr) + memset(ptr, 0, sz); + return (ptr); +} + +void +vdisk_free(void *ptr) +{ + vdisk_free_func(ptr); + ptr = NULL; +} + +int +vdisk_close(int fp) +{ + int err; + + err = fsync(fp); + if (err) + VIDDBG(0, "fsync(): %s\n", strerror(errno)); + + // Invalidate all pages from page cache + err = posix_fadvise(fp, 0, 0, POSIX_FADV_DONTNEED); + if (err) + VIDDBG(0, "posix_fadvise(): %s\n", strerror(errno)); + + err = close(fp); + return (err); +} + +size_t +vdisk_size(int f, size_t *sz) +{ + size_t cur; + int err; + + /* + * XXX: Obviously, we should use fstat(). Unfortunately, I couldn't + * figure out how to make a dynamic library that calls fstat. + * See glibc FAQ for descritpion of *problem* (why couldn't they + * provide a solution as well?) 
+ */ + + // Remember current position + cur = lseek(f, 0, SEEK_CUR); + if (cur == -1) { + err = errno; + VIDDBG(0, "lseek: Can't seek to current: %s\n", strerror(errno)); + return (err); + } + + *sz = lseek(f, 0, SEEK_END); + if (*sz == -1) { + err = errno; + VIDDBG(0, "lseek: Can't seek to end: %s\n", strerror(errno)); + return (err); + } + + // Restore current position + cur = lseek(f, 0, SEEK_SET); + if (cur == -1) { + err = errno; + VIDDBG(0, "lseek: Can't seek to current: %s\n", strerror(errno)); + return (err); + } + + return (0); +} + +size_t +vdisk_asyncio(vdisk_dev_t *vdisk, uint64_t block, + int fp, char *buf, + size_t size, off_t off, + void *arg, void *aiocb, + int op) +{ + int hash_index = VDISK_HASH_IDX(block); + struct iocb *io; + struct pending_aio *pio; + + + ASSERT(vdisk->aio_cnt < VDISK_HASH_SZ); + ASSERT(vdisk->hash[hash_index].key == block); + + io = &vdisk->hash[hash_index].io; + pio = &vdisk->hash[hash_index].pio; + + pio->arg = arg; + pio->block = block; + pio->aiocb = aiocb; + pio->num_blocks = size>>9; + pio->off = off; + pio->fd = fp; + pio->op = op; + + if (op == VDISK_WRITE) + io_prep_pwrite(io, fp, buf, size, off); + else + io_prep_pread(io, fp, buf, size, off); + + io->data = pio; + + VIDDBG(50, "Using hash entry %d (block %d)\n", + VDISK_HASH_IDX(pio->block), pio->block); + + vdisk->aio_submit[vdisk->aio_cnt++] = io; + + return (size); +} + +static void +vdisk_manage_pcache(int fp, vdisk_syncio_t *syncio, off_t start, off_t len) +{ +#define WRITE_RUN (1<<22) //4MB + int res; + DO_STATS(time_t now); + + DO_STATS(++(syncio->total_writes)); + + if (syncio->is_set) { + if (start >= syncio->io_start && + start <= syncio->io_start + syncio->io_len) { + syncio->io_len -= (syncio->io_start + + syncio->io_len) - start; + syncio->io_len += len; + DO_STATS(++(syncio->contig_writes)); + if (syncio->io_len > WRITE_RUN) { + DO_STATS(++(syncio->flush_size_force)); + + syncio->is_set = 0; + + res = fsync(fp); + if (res) + VIDDBG(0, "fsync: %s\n", + 
strerror(errno)); + + res = posix_fadvise(fp, syncio->io_start, + syncio->io_len, + POSIX_FADV_DONTNEED); + if (res) + VIDDBG(0, "posix_fadvise: %s\n", + strerror(errno)); + } + len = 0; // NOTE:len is consumed into previous + } else { +#if VDISK_SYNCIO_STATS + if (syncio->io_len < (1<<20)) + ++(syncio->flush_size_sub1MB); + else if (syncio->io_len < (1<<21)) + ++(syncio->flush_size_sub2MB); + else if (syncio->io_len < (1<<22)) + ++(syncio->flush_size_sub4MB); + else if (syncio->io_len < (1<<23)) + ++(syncio->flush_size_sub8MB); + else + ++(syncio->flush_size_ovr8MB); +#endif /* VDISK_SYNCIO_STATS */ + syncio->is_set = 0; + res = fsync(fp); + if (res) + VIDDBG(0, "fsync: %s\n", strerror(errno)); + res = posix_fadvise(fp, syncio->io_start, + syncio->io_len, + POSIX_FADV_DONTNEED); + if (res) + VIDDBG(0, "posix_fadvise: %s\n", + strerror(errno)); + } + } + if (len > 0) { + if (len <= WRITE_RUN) { + syncio->is_set = 1; + syncio->io_start = start; + syncio->io_len = len; + } else { + DO_STATS(++(syncio->flush_size_force)); + res = fsync(fp); + if (res) + VIDDBG(0, "fsync: %s\n", strerror(errno)); + res = posix_fadvise(fp, start, len, + POSIX_FADV_DONTNEED); + if (res) + VIDDBG(0, "posix_fadvise: %s\n", + strerror(errno)); + } + } +#if VDISK_SYNCIO_STATS + now = time(NULL); + if (now >= syncio->last_dbg_print + 60) { + VIDDBG(0, ":WRITE_PERF: [%lu] tWrts %lu | conWrts %lu | s1M %lu" + " | s2M %lu | s4M %lu | s8M %lu | o8M %lu | f %lu\n", + (unsigned long)(now - syncio->last_dbg_print), + syncio->total_writes, syncio->contig_writes, + syncio->flush_size_sub1MB, syncio->flush_size_sub2MB, + syncio->flush_size_sub4MB, syncio->flush_size_sub8MB, + syncio->flush_size_ovr8MB, syncio->flush_size_force); + syncio->last_dbg_print = now; + } +#endif /* VDISK_SYNCIO_STATS */ +} + +size_t +vdisk_syncio(int fp, char *buf, size_t size, off_t off, int op, + vdisk_syncio_t *syncio) +{ + size_t bytes; + off_t res; + off_t io_start; + off_t io_len; + + ASSERT(!(size & 511)); + 
ASSERT(!(off & 511)); + ASSERT(!((addr_t)buf & 511)); + + res = vdisk_seek(fp, off, SEEK_SET); + if (res != off) { + VIDDBG(0, "lseek couldn't set offset to 0x%" PRIx64 ": %s\n", + off, strerror(errno)); + return (-1); + } + + if (op == VDISK_WRITE) { + bytes = write(fp, buf, size); + } else + bytes = read(fp, buf, size); + + if (bytes != size) { + VIDDBG(0, "%s %zd bytes instead of %zd: %s\n", + (op == VDISK_WRITE)?"Wrote":"Read", + bytes, size, strerror(errno)); + } + + io_start = (off & (~((off_t)vdisk_pagesz-1))); + io_len = (size + vdisk_pagesz); + + if (op == VDISK_READ) { + res = posix_fadvise(fp, io_start, io_len, POSIX_FADV_DONTNEED); + if (res) + VIDDBG(0, "posix_fadvise: %s\n", strerror(errno)); + } else if (syncio) { + vdisk_manage_pcache(fp, syncio, io_start, io_len); + } else { + res = fsync(fp); + if (res) + VIDDBG(0, "fsync: %s\n", strerror(errno)); + res = posix_fadvise(fp, io_start, io_len, POSIX_FADV_DONTNEED); + if (res) + VIDDBG(0, "posix_fadvise: %s\n", strerror(errno)); + } + + return (bytes); +} diff -r 75c61490cc06 tools/vdisk/vdisk_utils.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/vdisk/vdisk_utils.h Thu Jun 21 13:05:31 2007 -0400 @@ -0,0 +1,36 @@ +// Copyright (c) 2003-2007, Virtual Iron Software, Inc. +// +// Portions have been modified by Virtual Iron Software, Inc. +// (c) 2007. This file and the modifications can be redistributed and/or +// modified under the terms and conditions of the GNU General Public +// License, version 2.1 and not any later version of the GPL, as published +// by the Free Software Foundation. 
+ +#ifndef __VDISK_UTILS +#define __VDISK_UTILS + + +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <unistd.h> +#include <errno.h> + +#include "vdisk.h" + + + +#define vdisk_open(cp, fl, mode) open((cp), (fl), (mode)) +#define vdisk_seek(fp, off, whence) lseek64((fp), (off), (whence)) + +extern void *vdisk_malloc(size_t sz); +extern void vdisk_free(void *ptr); +extern int vdisk_close(int fp); +extern size_t vdisk_size(int f, size_t *sz); +extern size_t vdisk_syncio(int fp, char *buf, size_t sz, loff_t off, + int op, vdisk_syncio_t *syncio); +extern size_t vdisk_asyncio(vdisk_dev_t *, uint64_t, int, char *, size_t, + loff_t, void *, void *, int); + + +#endif /* __VDISK_UTILS */ diff -r 75c61490cc06 tools/vdisk/vhd.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/vdisk/vhd.c Thu Jun 21 13:05:31 2007 -0400 @@ -0,0 +1,925 @@ +// Copyright (c) 2003-2007, Virtual Iron Software, Inc. +// +// Portions have been modified by Virtual Iron Software, Inc. +// (c) 2007. This file and the modifications can be redistributed and/or +// modified under the terms and conditions of the GNU General Public +// License, version 2.1 and not any later version of the GPL, as published +// by the Free Software Foundation. + +#define _GNU_SOURCE // for O_DIRECT +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "list.h" +#include "vdisk.h" +#include "vdisk_utils.h" +#include "vhd.h" +#include "vhd_footer.h" + +char __vhd_zeroes[VHD_FTR_SZ+512]; +char *vhd_zeroes; + +#define BLOCK_MASK (~(((addr_t)1<<9)-1)) + + +int +vhd_verify_metadata(vd_file_t *vf) +{ + // XXX: Something more robust, maybe? 
+ return (0); +} + 
// Read the 512-byte VHD footer found at the end of vf's file into a
// 512-byte aligned buffer (vhd->ftr, backed by vhd->ftr_mem). The buffer
// is allocated on first use.
//
// Returns 0 on success, ENOMEM if the buffer cannot be allocated and -1
// on any I/O failure. On I/O failure the buffer is released and both
// pointers are cleared, so callers can free vhd->ftr_mem unconditionally
// without freeing it twice.
int
vhd_read_footer(vd_file_t *vf)
{
	off_t ftr_off, res;
	vhd_file_t *vhd = vf->vdf;
	size_t bytes;

	if (vhd->ftr_mem == NULL) {
		vhd->ftr_mem = vdisk_malloc(VHD_FTR_SZ+512);
		if (vhd->ftr_mem == NULL) {
			VIDDBG(0, "Couldn't allocate footer\n");
			return (ENOMEM);
		}
		// Advance to the first 512-byte boundary inside the buffer
		vhd->ftr = vhd->ftr_mem;
		while ((addr_t)vhd->ftr & 511) vhd->ftr++;
	}

	/* Find file size (seek to the end) */
	res = vdisk_seek(vf->fd, 0, SEEK_END);
	if (res == -1) {
		VIDDBG(0, "lseek couldn't set offset to end of file\n");
		goto fail;
	}

	// A file shorter than one footer would give a negative offset
	if (res < 512) {
		VIDDBG(0, "File is too small to contain a footer\n");
		goto fail;
	}

	ftr_off = res - 512;

	res = vdisk_seek(vf->fd, ftr_off, SEEK_SET);
	if (res != ftr_off) {
		VIDDBG(0, "lseek couldn't set offset to 0x%" PRIx64 "\n",
		       ftr_off);
		goto fail;
	}

	if ((bytes = vdisk_syncio(vf->fd, vhd->ftr, 512, ftr_off,
				  VDISK_READ, NULL)) != 512) {
		VIDDBG(0, "vdisk read from offset 0x%" PRIx64 " failed "
		       "(read %zd insted of 512) %d\n",
		       ftr_off, bytes, errno);
		goto fail;
	}

	return 0;

fail:
	// Forget the buffer once it is released; the original left
	// ftr_mem dangling, which the caller's cleanup then freed again.
	vdisk_free(vhd->ftr_mem);
	vhd->ftr_mem = NULL;
	vhd->ftr = NULL;
	return (-1);
}

+int +vhd_read_dynhdr(vd_file_t *vf) +{ + off_t res; + vhd_file_t *vhd = vf->vdf; + size_t bat_sz; + int err = 0; + + + vhd->dhdr_mem = vdisk_malloc(VHD_DHDR_SZ+512); + if (vhd->dhdr_mem == NULL) { + VIDDBG(0, "Couldn't allocate dynamic header\n"); + err = ENOMEM; + goto fail; + } + + vhd->dhdr = vhd->dhdr_mem; + while ((addr_t)vhd->dhdr & 511) vhd->dhdr++; + + // Skip copy of the footer + res = vdisk_seek(vf->fd, VHD_FTR_SZ, SEEK_SET); + if (res != VHD_FTR_SZ) { + VIDDBG(0, "Couldn't skip copy of the footer\n"); + err = -1; + goto fail; + } + + if (vdisk_syncio(vf->fd, vhd->dhdr, VHD_DHDR_SZ, VHD_FTR_SZ, VDISK_READ, NULL) + != VHD_DHDR_SZ) { + VIDDBG(0, "Failed to read dynamic header"); + err = -1; + goto fail; + } + + // Read BAT (in 512B units) + // XXX: May need to only keep a part of BAT due to memory size concerns + bat_sz = vhd_get_dhdr_tbl_entries(vhd->dhdr) << 2; + if (bat_sz & 511) + 
bat_sz += (512-(bat_sz & 511)); + + vhd->bat_mem = vdisk_malloc(bat_sz+512); + if (vhd->bat_mem == NULL) { + VIDDBG(0, "Couldn't allocate BAT\n"); + err = ENOMEM; + goto fail; + } + vhd->bat = vhd->bat_mem; + while ((addr_t)vhd->bat & 511) vhd->bat++; + + if (vdisk_syncio(vf->fd, (char *)vhd->bat, bat_sz, + VHD_DHDR_SZ+VHD_FTR_SZ, VDISK_READ, NULL) != bat_sz) { + VIDDBG(0, "Failed to read BAT"); + err = -1; + goto fail; + } + + return (0); + +fail: + return (err); +} + 
// Load the VHD metadata of vf: footer, and for dynamic/differencing
// disks the dynamic header, the BAT, the sectormap scratch buffer and
// the sectormap cache. Also fills in vdisk->sz and vdisk->geom from the
// footer.
//
// Returns 0 on success or an error code. On failure every buffer
// allocated here is released exactly once and vf->vdf is reset to NULL.
int
vhd_read_metadata(vdisk_dev_t *vdisk, vd_file_t *vf)
{
	int err;
	int type;
	vhd_file_t *vhd = NULL;
	int secs_per_block;
	uint32_t geom;
	int i;

	vf->vdf = (vhd_file_t *)vdisk_malloc(sizeof(vhd_file_t));
	if (vf->vdf == NULL) {
		VIDDBG(0, "Couldn't allocate format-specific data\n");
		err = ENOMEM;
		goto fail;
	}

	vhd = vf->vdf;
	memset(vhd, 0, sizeof(vhd_file_t));

	err = vhd_read_footer(vf);
	if (err) {
		VIDDBG(0, "Couldn't read footer\n");
		// vhd_read_footer() releases its buffer on I/O failure;
		// forget the pointer so the cleanup below cannot free it
		// a second time.
		vhd->ftr_mem = NULL;
		vhd->ftr = NULL;
		goto fail;
	}

	vdisk->sz = vhd_get_ftr_orig_sz(vhd->ftr);

	type = vhd_get_ftr_type(vhd->ftr);
	if ((type != VHD_TYPE_FIXED) &&
	    (type != VHD_TYPE_DYNAMIC) &&
	    (type != VHD_TYPE_DIFF)) {
		// Return error for VHD_TYPE_NONE as well.
		VIDDBG(0, "Unsupported VHD file type (%d)\n", type);
		err = EIO;	// XXX: Something else?
		goto fail;
	}

	if (type != VHD_TYPE_FIXED) {
		size_t sz;

		// We should have a dynamic header
		err = vhd_read_dynhdr(vf);
		if (err) {
			VIDDBG(0, "Couldn't read dynamic header\n");
			goto fail;
		}

		// No fls() in userland, so we do log2 ourselves
		vhd->sec_per_block_log = 0;
		secs_per_block = vhd_get_dhdr_blksz(vhd->dhdr) >> 9;
		while (secs_per_block >>= 1)
			vhd->sec_per_block_log++;

		if (type == VHD_TYPE_DYNAMIC) {
			// How many sectors are mapped sequentially
			vf->batch_sz = (1<<vhd->sec_per_block_log);
		} else {
			// XXX: Need to think about this.
			vf->batch_sz = 1;
		}

		// bytes for sectormap is ((sectors per block) / 8)
		vhd->sectormap_sz = (vhd_get_dhdr_blksz(vhd->dhdr) >> 9) >> 3;

		// Align on 512-byte boundary
		if ((vhd->sectormap_sz == 0) || (vhd->sectormap_sz & 511))
			vhd->sectormap_sz += 512 - (vhd->sectormap_sz & 511);

		// First new block will be allocated where the footer
		// currently is, which is at the end of the file
		err = vdisk_size(vf->fd, &sz);
		if (err) {
			VIDDBG(0, "Couldn't get file size\n");
			goto fail;
		}
		vhd->next_block_off = (sz-VHD_FTR_SZ) >> 9;

		// Allocate sectormap buffer
		vhd->sec_mem = vdisk_malloc(512*2);
		if (vhd->sec_mem == NULL) {
			VIDDBG(0, "Can't allocate sectormap\n");
			err = ENOMEM;
			goto fail;
		}
		vhd->secmap_chunk = vhd->sec_mem;
		while ((addr_t)vhd->secmap_chunk & 511) vhd->secmap_chunk++;

		// Allocate sectormap cache
		for (i = 0; i < VHD_CACHE_SZ; i++) {
			vhd->cache[i].sec_mem = vdisk_malloc(512*2);
			if (vhd->cache[i].sec_mem == NULL) {
				VIDDBG(0, "Can't allocate sectormap\n");
				err = ENOMEM;
				goto fail;
			}
			vhd->cache[i].secmap_chunk = vhd->cache[i].sec_mem;
			while ((addr_t)vhd->cache[i].secmap_chunk & 511)
				vhd->cache[i].secmap_chunk++;

			// Point to sector 0 (or any other sector),
			// but make the map empty
			vhd->cache[i].first_sector = 0; //VHD_INVALID_SECTOR;
			memset(vhd->cache[i].secmap_chunk, 0, 512);
		}

		// Chain the cache lines into a doubly-linked list
		if (VHD_CACHE_SZ > 0) {
			vhd->cache_head = &vhd->cache[0];
			vhd->cache[0].prev = NULL;
			for (i = 1; i < VHD_CACHE_SZ; i++) {
				vhd->cache[i-1].next = &vhd->cache[i];
				vhd->cache[i].prev = &vhd->cache[i-1];
			}
			vhd->cache_tail = &vhd->cache[VHD_CACHE_SZ-1];
			vhd->cache[VHD_CACHE_SZ-1].next = NULL;
		} //else
		  //vhd->cache_head == NULL;
	} else
		vf->batch_sz = (1<<30);	// (signed) infinity

	vf->flags = 0;

	err = vhd_verify_metadata(vf);
	if (err) {
		VIDDBG(0, "File appears to be corrupted\n");

		// XXX: It may be salvageable
		// Buffers are released once, at the fail label; freeing
		// them here as well (as the original did) freed them twice.
		err = EIO;
		goto fail;
	}

	// We are assuming here that all files of the
	// vdisk have the same geometry.
	geom = vhd_get_ftr_geom(vhd->ftr);
	vdisk->geom.cyls = (geom >> 16) & 0xffff;
	vdisk->geom.heads = (geom >> 8) & 0xff;
	vdisk->geom.secs = geom & 0xff;

	return (0);

fail:
	if (vhd) {
		if (vhd->ftr_mem)
			vdisk_free(vhd->ftr_mem);
		if (vhd->dhdr_mem)
			vdisk_free(vhd->dhdr_mem);
		if (vhd->bat_mem)
			vdisk_free(vhd->bat_mem);
		if (vhd->sec_mem)
			vdisk_free(vhd->sec_mem);
		// vhd was zeroed above, so cache lines that were never
		// allocated still hold NULL here
		for (i = 0; i < VHD_CACHE_SZ; i++) {
			if (vhd->cache[i].sec_mem)
				vdisk_free(vhd->cache[i].sec_mem);
		}
		vdisk_free(vhd);
		vf->vdf = NULL;
	}
	return (err);
}

+int +vhd_alloc_block(vd_file_t *vf, uint32_t blockno) +{ + size_t bytes; + off_t bat_off; + char *ptr; + vhd_file_t *vhd = vf->vdf; + size_t blocksz; + + + ASSERT(__arch__swab32(vhd->bat[blockno]) == VHD_BAT_INVALID_ENTRY); + ASSERT((vhd_get_dhdr_blksz(vhd->dhdr) & 511) == 0); + ASSERT((vhd->sectormap_sz & 511) == 0); + + blocksz = vhd_get_dhdr_blksz(vhd->dhdr) + vhd->sectormap_sz; + + /* + * First try to write footer at new position. + * The hole should be filled with zeroes + * XXX: Are we sure? 
+ */ + bytes = vdisk_syncio(vf->fd, vhd->ftr, VHD_FTR_SZ, + (vhd->next_block_off<<9) + blocksz, + VDISK_WRITE, NULL); + if (bytes != VHD_FTR_SZ) { + VIDDBG(0, "Can't append footer\n"); + return (EIO); + } + + + // Overwrite footer with zeroes + bytes = vdisk_syncio(vf->fd, vhd_zeroes, VHD_FTR_SZ, + vhd->next_block_off<<9, VDISK_WRITE, NULL); + if (bytes != VHD_FTR_SZ) { + VIDDBG(0, "Can't overwrite footer\n"); + return (EIO); + } + + // Now update BAT in a 512-b chunk + vhd->bat[blockno] = __arch__swab32(vhd->next_block_off); + bat_off = (VHD_FTR_SZ + VHD_DHDR_SZ + (blockno<<2)) & BLOCK_MASK; + ptr = (char *)(((addr_t)&vhd->bat[blockno]) & BLOCK_MASK); + bytes = vdisk_syncio(vf->fd, ptr, 512, bat_off, VDISK_WRITE, NULL); + if (bytes != 512) { + VIDDBG(0, "Can't update BAT\n"); + return (EIO); + } + + vhd->next_block_off += (blocksz >> 9); + + return(0); +} + +/* + * It would be easier to use test_bit()/set_bit() routines, + * but x86 bit test/set instructions count bits (in the last byte) + * from LSb, which is not what we want. We could recompute pos + * (pos=(pos&(~7))+7-(pos&7)) but doing this operation more + * explicitely seems to be safer. 
+ */ +inline int +vhd_test_bit(int pos, char *buf) +{ + char *addr = (char *)((addr_t)buf + (pos>>3)); + uint8_t byte = *addr; + uint8_t bitinbyte = 7-(pos&7); + + return (byte & (1<<bitinbyte)); +} + +inline int +vhd_test_bitset(int start, int bits, char *buf) +{ + int i; + + for (i=0;i<bits;i++) + if (!vhd_test_bit(start+i, buf)) + return (0); + + return (1); +} + +inline void +vhd_set_bit(int pos, char *buf) +{ + char *addr = (char *)((addr_t)buf + (pos>>3)); + uint8_t byte = *addr; + uint8_t bitinbyte = 7-(pos&7); + + *addr = byte | (1<<bitinbyte); +} + +inline void +vhd_set_bitset(int start, int bits, char *buf) +{ + int i; + + for (i=0;i<bits;i++) + vhd_set_bit(start+i, buf); +} + + +int +vhd_xfer_commit(void *arg, int err) +{ + vhd_xfer_t *vhdx = arg; + size_t bytes; + + if (arg == NULL) + return (0); + + if (err == 0) { + + // Read the 512b chunk of sector map + bytes = vdisk_syncio(vhdx->fd, vhdx->secmap_chunk, 512, + vhdx->secmap_addr, VDISK_READ, NULL); + if (bytes != 512) { + VIDDBG(0, "Failed to read sector bitmap\n"); + vdisk_free(vhdx->mem); + return (EIO); + } + + // Set sector bit + vhd_set_bitset(vhdx->sector_bit, vhdx->num_secs, + vhdx->secmap_chunk); + + // and write it back + bytes = vdisk_syncio(vhdx->fd, vhdx->secmap_chunk, 512, + vhdx->secmap_addr, VDISK_WRITE, NULL); + if (bytes != 512) { + VIDDBG(0, "Can't commit access\n"); + vdisk_free(vhdx->mem); + return (EIO); + } + + if (vhdx->cache && vhdx->first_sector != VHD_INVALID_SECTOR) { + ASSERT(vhdx->cache->first_sector == VHD_INVALID_SECTOR); + memcpy(vhdx->cache->secmap_chunk, vhdx->secmap_chunk, 512); + vhdx->cache->first_sector = vhdx->first_sector; + } + } + + vdisk_free(vhdx->mem); + return (0); +} + +// Microsoft uses "sector" for 512-byte unit that we +// refer to as "block" elsewhere. +// This routine is *NOT* SMP-safe! 
+int +vhd_map_block(vd_file_t *vf, + uint32_t *sectorno, /* IN/OUT */ + int num_secs, + int op, + void **arg) +{ + vhd_file_t *vhd = vf->vdf; + int type = vhd_get_ftr_type(vhd->ftr); + uint32_t blockno; // block of sectors in the file + int err; + size_t bytes; + int sector_bit; // bit offset into 512b chunk of sectormap + int sector_in_block; + off_t sectormap_addr; + uint32_t first_sector; + vhd_cache_t *cache = vhd->cache_head; + + + if (type == VHD_TYPE_FIXED) + return (VID_BLOCK_MAPPED); + + vhd->stats.access++; + + blockno = *sectorno >> vhd->sec_per_block_log; + + // We can only map sequence on sectors in the same block + ASSERT(((*sectorno+num_secs-1) >> vhd->sec_per_block_log) + == blockno); + + // First sector in the block (really, blockno<<vhd->sec_per_block_log) + first_sector = *sectorno & (~(((uint32_t)1<<vhd->sec_per_block_log)-1)); + + // This sector's offset in the block + sector_in_block = *sectorno & (((uint32_t)1<<vhd->sec_per_block_log)-1); + + sector_bit = sector_in_block & ((512*8)-1); // 8 bits in a byte + while (cache != NULL) { + if (cache->first_sector == first_sector) { + // Sectormap is cached + if (vhd_test_bitset(sector_bit, num_secs, + cache->secmap_chunk)) { + + // sector is mapped + *sectorno = cache->phys_first_sector + + + sector_in_block; + + vhd->stats.cache_hit++; + + // Make the line LRU + if (cache->prev) { + cache->prev->next = cache->next; + if (cache->next) + cache->next->prev = + cache->prev; + else + vhd->cache_tail = cache->prev; + + cache->next = vhd->cache_head; + cache->next->prev = cache; + cache->prev = NULL; + vhd->cache_head = cache; + } + + return (VID_BLOCK_MAPPED); + } else { + break; + } + } + cache = cache->next; + } + + if (__arch__swab32(vhd->bat[blockno]) == VHD_BAT_INVALID_ENTRY) { + + // For reads, the caller will assume that + // read returned zeroes + if (op == VDISK_READ) + return (VID_BLOCK_NOTMAPPED); + + err = vhd_alloc_block(vf, blockno); + vhd->stats.block_alloc++; + VIDDBG(100, "Allocated 
block %d\n", blockno); + if (err) { + VIDDBG(0, "Failed to allocate block\n"); + return (err); + } + } + + if (VHD_CACHE_SZ > 0) { + + if (vhd->cache_tail->first_sector != VHD_INVALID_SECTOR) { + if ((cache == NULL) && (vhd->cache_head != NULL)) { + vhd_cache_t *oldh = vhd->cache_head; + vhd_cache_t *oldt = vhd->cache_tail; + + vhd->cache_head = oldt; + vhd->cache_tail = oldt->prev; + + vhd->cache_head->next = oldh; + oldh->prev = oldt; + + vhd->cache_head->prev = NULL; + + vhd->cache_tail->next = NULL; + + cache = vhd->cache_head; + } + + } else { + // tail cache fill is in-flight. We assume that + // all others are in-flight as well. + // We will not be caching + // XXX: we should probably walk the list + //first_sector = VHD_INVALID_SECTOR; + } + } + + // Read a block worth of sector bitmap + sectormap_addr = + ((uint64_t)__arch__swab32(vhd->bat[blockno])<<9) + + ((sector_in_block>>3) & BLOCK_MASK); + bytes = vdisk_syncio(vf->fd, vhd->secmap_chunk, 512, + sectormap_addr, VDISK_READ, NULL); + if (bytes != 512) { + VIDDBG(0, "Failed to read sector bitmap\n"); + return (EIO); + } + + // See whether the sector is present + if (!vhd_test_bitset(sector_bit, num_secs, vhd->secmap_chunk)) { + vhd_xfer_t *vhdx; + int byteaddr, bitno; + char *ptr; + + // For reads, the caller will assume that + // read returned zeroes + if (op == VDISK_READ) { + int i; + int mapped = 0; + + for (i=0; i<num_secs; i++) { + if (vhd_test_bit(sector_bit+i, + vhd->secmap_chunk)) { + mapped = 1; + break; + } + } + + if (!mapped) { + // None of blocks is mapped + return (VID_BLOCK_NOTMAPPED); + } else { + // Some blocks are mapped and some are not + return (VID_BLOCK_TOOBIG); + } + } + + byteaddr = sector_bit >> 3; // Find word in the map + bitno = sector_bit & 7; // Bit in the word + ASSERT(byteaddr<512); + + // sectormap is the first member and will be aligned + vhdx = vdisk_malloc(sizeof(vhd_xfer_t)+512); + if (vhdx == NULL) { + VIDDBG(0, "Failed to allocate commit data\n"); + return (EIO); + 
} + + ptr = (char *)vhdx; + while ((addr_t)ptr & 511) ptr++; + + if (((addr_t)ptr - (addr_t)vhdx) >= 512) + VIDDBG(0, "vhdx=%p, ptr=%p\n", vhdx, ptr); + + ASSERT(((addr_t)ptr - (addr_t)vhdx) < 512); + + ((vhd_xfer_t *)ptr)->mem = (void *)vhdx; + vhdx = (vhd_xfer_t *)ptr; + vhdx->fd = vf->fd; + vhdx->secmap_addr = sectormap_addr; + vhdx->sector_bit = sector_bit; + vhdx->num_secs = num_secs; + + if (VHD_CACHE_SZ > 0) { + //vhdx->cache = &vhd->cache[cache_index]; + vhdx->cache = cache; + vhdx->first_sector = first_sector; + if (cache) // Flush old cache entry + cache->first_sector = VHD_INVALID_SECTOR; + } else + vhdx->first_sector = VHD_INVALID_SECTOR; + + *arg = vhdx; + + vhd->stats.sec_alloc++; + + } else { + // cache the map + if (VHD_CACHE_SZ > 0) { + if (cache && + (cache->first_sector != VHD_INVALID_SECTOR)) { + memcpy(cache->secmap_chunk, + vhd->secmap_chunk, 512); + cache->first_sector = first_sector; + } + } + } + + if (cache) + cache->phys_first_sector = __arch__swab32(vhd->bat[blockno]) + + (vhd->sectormap_sz >> 9); + + // Sector in the backing file + *sectorno = (__arch__swab32(vhd->bat[blockno])) + sector_in_block + + (vhd->sectormap_sz >> 9); + + + return (VID_BLOCK_MAPPED); +} + +void +vhd_close(struct vdisk_dev *vdisk) +{ + struct list_head *ptr, *tmp; + vd_file_t *vf; + vhd_file_t *vhd; + int err; + + if (vdisk == NULL) { + VIDDBG(0, "Invalid vdisk pointer\n"); + return; + } + + list_for_each_safe(ptr, tmp, &vdisk->vdf_head) { + + vf = list_entry(ptr, vd_file_t, vdf_list); + if (vf == NULL) { + VIDDBG(0, "Invalid vdisk file pointer\n"); + return; + } + + vhd = vf->vdf; + if (vhd) { + VIDDBG(10, "VHD Stats for %s: \n" + "\t accesses:\t%" PRId64 "\n" + "\t cache_hit:\t%" PRId64 "\n" + "\t block_alloc:\t%" PRId64 "\n" + "\t sec_alloc:\t%" PRId64 "\n" + "\t total IOs:\t%" PRId64 "\n" + "\t busy:\t%" PRId64 "\n" + "\t sync:\t%" PRId64 "\n" + "\t async:\t%" PRId64 "\n", + vf->name, + vhd->stats.access, + vhd->stats.cache_hit, + vhd->stats.block_alloc, + 
vhd->stats.sec_alloc, + vdisk->tot_io, + vdisk->busyio, + vdisk->syncio, + vdisk->asyncio); + + if (vhd->ftr_mem) + vdisk_free(vhd->ftr_mem); + if (vhd->dhdr_mem) + vdisk_free(vhd->dhdr_mem); + if (vhd->bat_mem) + vdisk_free(vhd->bat_mem); + if (vhd->sec_mem) + vdisk_free(vhd->sec_mem); + vdisk_free(vhd); + } + + list_del(&vf->vdf_list); + + err = vdisk_close(vf->fd); + if (err) + VIDDBG(0, "close(%s): %s\n", vf->name, strerror(errno)); + + vdisk_free(vf); + + if (list_empty(&vdisk->vdf_head)) + break; + } +} + + 
// Open filename and, for differencing disks, every parent it refers to,
// appending one vd_file_t per file to vdisk->vdf_head (leaf first). Only
// the leaf is opened read-write, and only if the vdisk is not VDISK_RO.
//
// Returns 0 on success or an errno-style code. When opening a file or
// reading its metadata fails, the files opened so far are closed via
// vhd_close(); the caller-owned vdisk itself is never freed here.
int vhd_open(struct vdisk_dev *vdisk, char *filename)
{
	int ret = 0;
	int err;
	vd_file_t *vf, *child_vf = NULL;
	char *f, *child = NULL;
	vhd_file_t *vhd;
	int rw;

	rw = (vdisk->flags & VDISK_RO) ? O_RDONLY : O_RDWR;

	f = (char *)filename;

	while (f != NULL) {	// Read all file associated with this VD file

		vf = (vd_file_t *)vdisk_malloc(sizeof(vd_file_t));
		if (vf == NULL) {
			VIDDBG(0, "Couldn't allocate vd_file structure\n");
			// Release what was opened so far. The original
			// freed vdisk here, which this function does
			// not own.
			vhd_close(vdisk);
			return (ENOMEM);
		}
		memset(vf, 0, sizeof(vd_file_t));

		// Bounded copy, always NUL-terminated; over-long names
		// are truncated as before.
		// NOTE(review): assumes vf->name is a char array (the
		// original strcpy()'d into it) -- confirm in vdisk.h
		(void)snprintf(vf->name, sizeof(vf->name), "%s", f);

		vf->fd = vdisk_open(f, rw, 0);
		if (vf->fd < 0) {
			err = errno;	// cleanup below may change errno
			VIDDBG(0, "Failed to open %s\n", f);
			vdisk_free(vf);
			vhd_close((void *)vdisk);
			return (err);
		}
		err = vhd_read_metadata(vdisk, vf);
		if (err) {
			VIDDBG(0, "Couldn't read metadata for %s\n", f);
			// vf is not on the list yet, so vhd_close() will
			// not close this descriptor for us
			(void)close(vf->fd);
			vdisk_free(vf);
			vhd_close((void *)vdisk);
			return (err);
		}


		if (child_vf == NULL) {
			vf->flags |= VDF_LEAF;
			rw = O_RDONLY;	// for next iteration
		}

#if 0
		// If this is a parent, verify paternity
		if (!vhd_isfamily(vf, child_vf)) {
			VIDDBG(0, "%s is not parent of %s\n",
			       f, child_vf);
		}
#endif

		list_add_tail(&vf->vdf_list, &vdisk->vdf_head);

		vhd = (vhd_file_t *)(vf->vdf);
		if (vhd_get_ftr_type(vhd->ftr) != VHD_TYPE_DIFF)
			break;	// Reached the base of the chain

		child = f;
		child_vf = vf;

		// Try the parent locator entries until one names a file
		// that can be opened
		for (int i = 0; i < 8; i++) {
			ple_t ple;
			int fd;

			vhd_get_dhdr_ple(vhd->dhdr, &ple, i);
			if (ple.code != VHD_DYN_PLE_ABS &&
			    ple.code != VHD_DYN_PLE_REL)
				continue;

			f = vhd_get_parent_name(vf, &ple);
			if (f == NULL) {
				// f is NULL here; name the child instead
				VIDDBG(0, "Can't locate parent "
				       "info for %s\n", child);
				ret = EINVAL;
				goto out;
			}

			// stat would be better
			fd = open(f, O_RDONLY);
			if (fd >= 0) {
				(void)close(fd);
				break;
			}

			if (errno == ENOENT ||
			    errno == ELOOP ||
			    errno == ENOTDIR ||
			    errno == ENODEV ||
			    errno == EFAULT)
				continue;	// try the next locator

			ret = errno;	// logging may change errno
			VIDDBG(0, "stat(%s): %s\n", f, strerror(ret));
			goto out;
		}

		// No locator of a supported type: without this check the
		// loop would re-open the child forever
		if (f == child) {
			VIDDBG(0, "No usable parent locator in %s\n", child);
			ret = EINVAL;
			goto out;
		}
	}
out:
	return ret;
}

+uint64_t +vhd_size(void *hdl) +{ + struct vdisk_dev *vdisk = (struct vdisk_dev *)hdl; + return (vdisk->sz); +} + +int +vhd_get_geometry(void *hdl, int *cyls, int *heads, int *secs) +{ + struct vdisk_dev *vdisk = (struct vdisk_dev *)hdl; + vd_file_t *vf = NULL; + struct list_head *ptr; + vhd_file_t *vhd; + uint32_t geom; + + + // Assume that the last file (base) has all the info + list_for_each(ptr, &vdisk->vdf_head) + vf = list_entry(ptr, vd_file_t, vdf_list); + + if (!vf) { + VIDDBG(0, "Can't find base file\n"); + return (-1); + } + + vhd = (vhd_file_t *)vf->vdf; + if (vhd == NULL) { + VIDDBG(0, "Can't find VHD data\n"); + return (-1); + } + geom = vhd_get_ftr_geom(vhd->ftr); + + *cyls = (geom >> 16) & 0xffff; + *heads = (geom >> 8) & 0xff; + *secs = geom & 0xff; + + VIDDBG(10, "geom = 0x%x (0x%x 0x%x 0x%x)\n", geom, *cyls, *heads, *secs); + + return (0); +} + +vdf_data_t vdfd_vhd = { + VHD_EXTENSION, + vhd_open, + vhd_close, + vhd_map_block, + vhd_xfer_commit, + vhd_print_header, + vhd_parse_args, + vhd_create_vdisk, + vhd_modify_vdisk, + {NULL,NULL}, +}; + +void +vhd_init() +{ + vhd_zeroes = __vhd_zeroes; + while ((addr_t)vhd_zeroes & 511) vhd_zeroes++; + + vdisk_register(&vdfd_vhd); + memset(vhd_zeroes, 0, VHD_FTR_SZ); +} + +void +vhd_exit() +{ + vdisk_unregister(&vdfd_vhd); +} diff -r 75c61490cc06 tools/vdisk/vhd.h --- 
/dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/vdisk/vhd.h Thu Jun 21 13:05:31 2007 -0400 @@ -0,0 +1,107 @@ +// Copyright (c) 2003-2007, Virtual Iron Software, Inc. +// +// Portions have been modified by Virtual Iron Software, Inc. +// (c) 2007. This file and the modifications can be redistributed and/or +// modified under the terms and conditions of the GNU General Public +// License, version 2.1 and not any later version of the GPL, as published +// by the Free Software Foundation. + +#ifndef __VHD_H +#define __VHD_H + +#define VHD_EXTENSION "vhd" + +#define VHD_FTR_SZ (512) +#define VHD_DHDR_SZ (1024) + +#define VHD_BAT_INVALID_ENTRY (0xffffffff) + +#define VHD_CACHE_SZ (16) +#define VHD_INVALID_SECTOR (0xffffffff) + +//Parent locator entry +typedef struct ple { + uint32_t code; + uint32_t data_space; + uint32_t data_len; + uint32_t reserved; // XXX: do we care? + uint64_t data_off; +} ple_t; + +typedef struct vhd_cache { + char *secmap_chunk; // 512b chunk of block's sectormap + char *sec_mem; + uint32_t first_sector; // First sector of the sectormap + uint32_t phys_first_sector; // Sector in the file that first_sector + // maps to + struct vhd_cache *next, *prev; +} vhd_cache_t; + +typedef struct vhd_stat { + uint64_t access; + uint64_t cache_hit; + uint64_t block_alloc; + uint64_t sec_alloc; +} vhd_stat_t; + +typedef struct vhd_xfer { + // sectormap *must* be first member! + char secmap_chunk[512]; // 512b chunk of sectormap. + off_t secmap_addr; // Address of the chunk + int sector_bit; // bit to be set in sectormap chunk + int num_secs; + vhd_cache_t *cache; + int first_sector; + file_t fd; + void *mem; // memory for vhd_xfer +} vhd_xfer_t; + +typedef struct vhd_file { + char *secmap_chunk; // 512B-aligned block of sectormap. 
+ char *sec_mem; // memory for sectormap section + char *ftr; // 512B-aligned footer + char *ftr_mem; // memory for footer + char *dhdr; // 512B-aligned dynamic header + char *dhdr_mem; // memory for dynamic header + uint32_t *bat; // 512B-aligned Block Allocation Table + uint32_t *bat_mem; // memory for BAT + vhd_cache_t cache[VHD_CACHE_SZ]; + vhd_cache_t *cache_head; + vhd_cache_t *cache_tail; + vhd_stat_t stats; + int sec_per_block_log; + int sectormap_sz; + off_t next_block_off; +} vhd_file_t; + + +#define VHD_ARG_SZ (1<<0) +#define VHD_ARG_TYPE (1<<1) +#define VHD_ARG_BLOCKSZ (1<<2) +#define VHD_ARG_UUID (1<<3) +#define VHD_ARG_TIME (1<<4) +#define VHD_ARG_PARENT (1<<5) + + +typedef struct vhd_args { + size_t vhd_sz; + uint8_t type; + size_t blocksz; + uint8_t uuid[16]; + char *parent; + uint64_t args_mask; +} vhd_args_t; + + +extern vdf_data_t vdfd_vhd; +extern char *vhd_zeroes; // Just a bunch of zeroes + +extern int vhd_print_header(vd_file_t *vf); +extern int vhd_parse_args(int argc, int operations, char *argv[], void **args); +extern int vhd_create_vdisk(char *filename, void *args); +extern int vhd_modify_vdisk(struct vdisk_dev *vdisk, void *args); +extern char *vhd_get_parent_name(vd_file_t *vf, ple_t *ple); +extern void vhd_init(void); +extern void vhd_exit(void); + +#endif /* __VHD_H */ diff -r 75c61490cc06 tools/vdisk/vhd_footer.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/vdisk/vhd_footer.h Thu Jun 21 13:05:31 2007 -0400 @@ -0,0 +1,316 @@ +// Copyright (c) 2003-2007, Virtual Iron Software, Inc. +// +// Portions have been modified by Virtual Iron Software, Inc. +// (c) 2007. This file and the modifications can be redistributed and/or +// modified under the terms and conditions of the GNU General Public +// License, version 2.1 and not any later version of the GPL, as published +// by the Free Software Foundation. 
+ +#ifndef __VHD_FOOTER_H +#define __VHD_FOOTER_H + +#include <string.h> +#include <linux/types.h> +#include <linux/byteorder/swab.h> + +#define VHD_COOKIE (uint64_t) ( (uint64_t)'c' \ + | ((uint64_t)'o'<<(8*1)) \ + | ((uint64_t)'n'<<(8*2)) \ + | ((uint64_t)'e'<<(8*3)) \ + | ((uint64_t)'c'<<(8*4)) \ + | ((uint64_t)'t'<<(8*5)) \ + | ((uint64_t)'i'<<(8*6)) \ + | ((uint64_t)'x'<<(8*7))) + +#define VHD_FEATURES_NONE (0) +#define VHD_FEATURES_TEMP (1) +#define VHD_FEATURES_RSVD (2) + +#define VHD_FORMAT_VER_1 (0x00010000) + +/* data offset for fixed disks */ +#define VHD_FIXED_OFFSET ((uint64_t)-1) + +#define VHD_CREATOR_APP ((uint32_t)'v' \ + | ((uint32_t)'i'<<8) \ + | ((uint32_t)'t'<<16) \ + | ((uint32_t)'l'<<24)) +#define VHD_CREATOR_VER_1 (0x00010000) + +#define VHD_CREATOR_HOST_OS ((uint32_t)'L' \ + | ((uint32_t)'i'<<8) \ + | ((uint32_t)'n'<<16) \ + | ((uint32_t)'x'<<24)) + +#define VHD_TYPE_NONE (0) +#define VHD_TYPE_FIXED (2) +#define VHD_TYPE_DYNAMIC (3) +#define VHD_TYPE_DIFF (4) + + + +#define VHD_GEOM(c,h,s) { \ + ASSERT((c<=0xffff) && (h<=0xff) && (s<=0xff)) ; \ + (s | (h<<8) | (c<<16)); } + + +static inline uint64_t vhd_get_ftr_cookie(char *ftr) { + uint64_t tmp = *(uint64_t *)(&ftr[0]); + return (tmp); +} +static inline void vhd_set_ftr_cookie(char *ftr, uint64_t val) { + uint64_t tmp = val; + *(uint64_t *)(&ftr[0]) = tmp; +} + +static inline uint32_t vhd_get_ftr_features(char *ftr) { + uint32_t tmp = *(uint32_t *)(&ftr[8]); + return __arch__swab32(tmp); +} +static inline void vhd_set_ftr_features(char *ftr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&ftr[8]) = __arch__swab32(tmp); +} + +static inline uint32_t vhd_get_ftr_fformat(char *ftr) { + uint32_t tmp = *(uint32_t *)(&ftr[12]); + return __arch__swab32(tmp); +} +static inline void vhd_set_ftr_fformat(char *ftr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&ftr[12]) = __arch__swab32(tmp); +} + +static inline uint64_t vhd_get_ftr_dataoff(char *ftr) { + uint64_t tmp = *(uint64_t 
*)(&ftr[16]); + return __arch__swab64(tmp); +} +static inline void vhd_set_ftr_dataoff(char *ftr, uint64_t val) { + uint64_t tmp = val; + *(uint64_t *)(&ftr[16]) = __arch__swab64(tmp); +} + +static inline uint32_t vhd_get_ftr_timestamp(char *ftr) { + uint32_t tmp = *(uint32_t *)(&ftr[24]); + return __arch__swab32(tmp); +} +static inline void vhd_set_ftr_timestamp(char *ftr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&ftr[24]) = __arch__swab32(tmp); +} + +static inline uint32_t vhd_get_ftr_cr_app(char *ftr) { + uint32_t tmp = *(uint32_t *)(&ftr[28]); + return (tmp); +} +static inline void vhd_set_ftr_cr_app(char *ftr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&ftr[28]) = tmp; +} + +static inline uint32_t vhd_get_ftr_cr_ver(char *ftr) { + uint32_t tmp = *(uint32_t *)(&ftr[32]); + return __arch__swab32(tmp); +} +static inline void vhd_set_ftr_cr_ver(char *ftr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&ftr[32]) = __arch__swab32(tmp); +} + +static inline uint32_t vhd_get_ftr_cr_hostos(char *ftr) { + uint32_t tmp = *(uint32_t *)(&ftr[36]); + return (tmp); +} +static inline void vhd_set_ftr_cr_hostos(char *ftr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&ftr[36]) = tmp; +} + +static inline uint64_t vhd_get_ftr_orig_sz(char *ftr) { + uint64_t tmp = *(uint64_t *)(&ftr[40]); + return __arch__swab64(tmp); +} +static inline void vhd_set_ftr_orig_sz(char *ftr, uint64_t val) { + uint64_t tmp = val; + *(uint64_t *)(&ftr[40]) = __arch__swab64(tmp); +} + +static inline uint64_t vhd_get_ftr_cur_sz(char *ftr) { + uint64_t tmp = *(uint64_t *)(&ftr[48]); + return __arch__swab64(tmp); +} +static inline void vhd_set_ftr_cur_sz(char *ftr, uint64_t val) { + uint64_t tmp = val; + *(uint64_t *)(&ftr[48]) = __arch__swab64(tmp); +} + +static inline uint32_t vhd_get_ftr_geom(char *ftr) { + uint32_t tmp = *(uint32_t *)(&ftr[56]); + return __arch__swab32(tmp); +} +static inline void vhd_set_ftr_geom(char *ftr, uint32_t val) { + uint32_t tmp = val; + 
*(uint32_t *)(&ftr[56]) = __arch__swab32(tmp); +} + +static inline uint32_t vhd_get_ftr_type(char *ftr) { + uint32_t tmp = *(uint32_t *)(&ftr[60]); + return __arch__swab32(tmp); +} +static inline void vhd_set_ftr_type(char *ftr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&ftr[60]) = __arch__swab32(tmp); +} + +#define VHD_FTR_CHKSUM_OFF (64) +static inline uint32_t vhd_get_ftr_chksum(char *ftr) { + uint32_t tmp = *(uint32_t *)(&ftr[64]); + return __arch__swab32(tmp); +} +static inline void vhd_set_ftr_chksum(char *ftr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&ftr[64]) = __arch__swab32(tmp); +} + +static inline uint8_t *vhd_get_ftr_uid(char *ftr) { + return (uint8_t *)&ftr[68]; +} +static inline void vhd_set_ftr_uid(char *ftr, uint8_t *val) { + memcpy(&ftr[68], val, 16); +} + +static inline uint8_t vhd_get_ftr_saved_state(char *ftr) { + uint8_t tmp = *(uint8_t *)(&ftr[84]); + return (tmp); +} +static inline void vhd_set_ftr_saved_state(char *ftr, uint8_t val) { + uint8_t tmp = val; + *(uint8_t *)(&ftr[84]) = tmp; +} + + + +#define VHD_DYN_COOKIE (uint64_t) ( (uint64_t)'c' \ + | ((uint64_t)'x'<<(8*1)) \ + | ((uint64_t)'s'<<(8*2)) \ + | ((uint64_t)'p'<<(8*3)) \ + | ((uint64_t)'a'<<(8*4)) \ + | ((uint64_t)'r'<<(8*5)) \ + | ((uint64_t)'s'<<(8*6)) \ + | ((uint64_t)'e'<<(8*7))) + +#define VHD_DYN_OFFSET ((uint64_t)-1) +#define VHD_DYN_HDR_VER_1 (0x00010000) + +// Parent locator codes (our own) +#define VHD_DYN_PLE_ABS ( (uint64_t)'u' \ + | ((uint64_t)'x'<<(8*1)) \ + | ((uint64_t)'n'<<(8*2)) \ + | ((uint64_t)'L'<<(8*3))) +#define VHD_DYN_PLE_REL ( (uint64_t)'k' \ + | ((uint64_t)'x'<<(8*1)) \ + | ((uint64_t)'n'<<(8*2)) \ + | ((uint64_t)'L'<<(8*3))) + + +static inline uint64_t vhd_get_dhdr_cookie(char *hdr) { + uint64_t tmp = *(uint64_t *)(&hdr[0]); + return (tmp); +} +static inline void vhd_set_dhdr_cookie(char *hdr, uint64_t val) { + uint64_t tmp = val; + *(uint64_t *)(&hdr[0]) = tmp; +} + +static inline uint64_t vhd_get_dhdr_dataoff(char *hdr) { + 
uint64_t tmp = *(uint64_t *)(&hdr[8]); + return __arch__swab64(tmp); +} +static inline void vhd_set_dhdr_dataoff(char *hdr, uint64_t val) { + uint64_t tmp = val; + *(uint64_t *)(&hdr[8]) = __arch__swab64(tmp); +} + +static inline uint64_t vhd_get_dhdr_tbloff(char *hdr) { + uint64_t tmp = *(uint64_t *)(&hdr[16]); + return __arch__swab64(tmp); +} +static inline void vhd_set_dhdr_tbloff(char *hdr, uint64_t val) { + uint64_t tmp = val; + *(uint64_t *)(&hdr[16]) = __arch__swab64(tmp); +} + +static inline uint32_t vhd_get_dhdr_hdrver(char *hdr) { + uint32_t tmp = *(uint32_t *)(&hdr[24]); + return __arch__swab32(tmp); +} +static inline void vhd_set_dhdr_hdrver(char *hdr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&hdr[24]) = __arch__swab32(tmp); +} + +static inline uint32_t vhd_get_dhdr_tbl_entries(char *hdr) { + uint32_t tmp = *(uint32_t *)(&hdr[28]); + return __arch__swab32(tmp); +} +static inline void vhd_set_dhdr_tbl_entries(char *hdr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&hdr[28]) = __arch__swab32(tmp); +} + +static inline uint32_t vhd_get_dhdr_blksz(char *hdr) { + uint32_t tmp = *(uint32_t *)(&hdr[32]); + return __arch__swab32(tmp); +} +static inline void vhd_set_dhdr_blksz(char *hdr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&hdr[32]) = __arch__swab32(tmp); +} + +#define VHD_DHDR_CHKSUM_OFF (36) +static inline uint32_t vhd_get_dhdr_chksum(char *hdr) { + uint32_t tmp = *(uint32_t *)(&hdr[36]); + return __arch__swab32(tmp); +} +static inline void vhd_set_dhdr_chksum(char *hdr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&hdr[36]) = __arch__swab32(tmp); +} + +static inline uint8_t *vhd_get_dhdr_puid(char *hdr) { + return (uint8_t *)&hdr[40]; +} +static inline void vhd_set_dhdr_puid(char *hdr, uint8_t *val) { + memcpy(&hdr[40], val, 16); +} + +static inline uint32_t vhd_get_dhdr_ptimestamp(char *hdr) { + uint32_t tmp = *(uint32_t *)(&hdr[56]); + return __arch__swab32(tmp); +} +static inline void 
vhd_set_dhdr_ptimestamp(char *hdr, uint32_t val) { + uint32_t tmp = val; + *(uint32_t *)(&hdr[56]) = __arch__swab32(tmp); +} + +static inline void vhd_get_dhdr_ple(char *hdr, ple_t *ple, int idx) { + char *tmp = &hdr[576+24*idx]; + + ple->code = __arch__swab32(*(uint32_t *)tmp); + ple->data_space = __arch__swab32(*(uint32_t *)(tmp+4)); + ple->data_len = __arch__swab32(*(uint32_t *)(tmp+8)); + ple->data_off = __arch__swab64(*(uint64_t *)(tmp+16)); +} + +static inline void vhd_set_dhdr_ple(char *hdr, ple_t *ple, int idx) { + char *tmp = &hdr[576+24*idx]; + + *(uint32_t *)(tmp) = __arch__swab32(ple->code); + *(uint32_t *)(tmp+4) = __arch__swab32(ple->data_space); + *(uint32_t *)(tmp+8) = __arch__swab32(ple->data_len); + *(uint64_t *)(tmp+16) = __arch__swab64(ple->data_off); +} + + + +#endif /* __VHD_FOOTER_H */ diff -r 75c61490cc06 tools/vdisk/vhd_utils.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/vdisk/vhd_utils.c Thu Jun 21 13:05:31 2007 -0400 @@ -0,0 +1,964 @@ +// Copyright (c) 2003-2007, Virtual Iron Software, Inc. +// +// Portions have been modified by Virtual Iron Software, Inc. +// (c) 2007. This file and the modifications can be redistributed and/or +// modified under the terms and conditions of the GNU General Public +// License, version 2.1 and not any later version of the GPL, as published +// by the Free Software Foundation. + +#include <stdio.h> +#include <stdlib.h> +#include <linux/stddef.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <errno.h> +#include <time.h> +#include <string.h> +#include <unistd.h> +#include <ctype.h> + +#include "vdisk.h" +#include "vdisk_utils.h" +#include "vhd.h" +#include "vhd_footer.h" + + +// Taken from Microsoft's VHD spec (hence notations...) 
+static uint32_t +vhd_chs(ssize_t sz) +{ + uint32_t totalSectors = (sz >> 9); // Assume sz in whole #sectors + int sectorsPerTrack, heads, cylinderTimesHeads, cylinders; + + + if (totalSectors > 65535 * 16 * 255) + totalSectors = 65535 * 16 * 255; + + if (totalSectors >= 65535 * 16 * 63) { + sectorsPerTrack = 255; + heads = 16; + cylinderTimesHeads = totalSectors / sectorsPerTrack; + } else { + sectorsPerTrack = 17; + cylinderTimesHeads = totalSectors / sectorsPerTrack; + + heads = (cylinderTimesHeads + 1023) / 1024; + + if (heads < 4) + heads = 4; + + if (cylinderTimesHeads >= (heads * 1024) || heads > 16) { + sectorsPerTrack = 31; + heads = 16; + cylinderTimesHeads = totalSectors / sectorsPerTrack; + } + + if (cylinderTimesHeads >= (heads * 1024)) { + sectorsPerTrack = 63; + heads = 16; + cylinderTimesHeads = totalSectors / sectorsPerTrack; + } + } + cylinders = cylinderTimesHeads / heads; + + return (VHD_GEOM(cylinders, heads, sectorsPerTrack)); +} + +uint32_t +vhd_chksum(char *ptr, size_t sz, char *excl) +{ + uint32_t chksum = 0; + int i; + + if (ptr == NULL) + return (0); + + for (i=0; i<sz; i++) + chksum += (uint8_t)ptr[i]; + + if (excl != NULL) { + // Subtract 4 bytes of checksum + chksum -= (excl[0] + excl[1] + excl[2] + excl[3]); + } + + return (~chksum); +} + + +static char * +vhd_time(uint32_t *file_time, uint32_t *cur_time, int f2c) +{ + time_t tm, tm_1970, tm_2000; + char *timestr; + struct tm epoch_1970 = {0,0,0,1,0,70,0,0,0}; + struct tm epoch_2000 = {0,0,0,1,0,100,0,0,0}; + + + tm_1970 = mktime(&epoch_1970); + tm_2000 = mktime(&epoch_2000); + + if (f2c) { + if (file_time == NULL) { + VIDDBG(0, "Invalid time\n"); + if (cur_time != NULL) + *cur_time = 0; + return NULL; + } + + tm = *file_time + (tm_2000-tm_1970); + timestr = ctime(&tm); + if (timestr == NULL) { + VIDDBG(0, "Couldn't convert time (0x%x)\n", *file_time); + return NULL; + } + if (cur_time != NULL) + *cur_time = tm; + timestr[strlen(timestr)-1] = '\0'; + } else { + if (cur_time == NULL) { 
+ VIDDBG(0, "Invalid time\n"); + if (file_time != NULL) + *file_time = 0; + return NULL; + } + tm = *cur_time - (tm_2000-tm_1970); + timestr = ctime(&tm); + if (timestr == NULL) { + VIDDBG(0, "Couldn't convert time (0x%x)\n", *cur_time); + return NULL; + } + if (file_time != NULL) + *file_time = tm; + timestr[strlen(timestr)-1] = '\0'; + } + + return timestr; +} + + +char * +vhd_get_parent_name(vd_file_t *vf, ple_t *ple) +{ + char *cp, *buf, *pool; + int bytes; + + if ((ple == NULL) || (ple->data_len == 0)) { + VIDDBG(0, "Invalid data\n"); + return (NULL); + } + + // The file is opened with O_DIRECT, so we need to + // align buffer on 512-byte boundary + pool = buf = vdisk_malloc(ple->data_space+512); + if (buf == NULL) { + perror("malloc"); + return (NULL); + } + while((addr_t)buf & 511) buf++; + + if (lseek(vf->fd, ple->data_off, SEEK_SET) != ple->data_off) { + perror("lseek"); + vdisk_free(pool); + return NULL; + } + + bytes = read(vf->fd, buf, (size_t)ple->data_space); + if (bytes != ple->data_space) { + perror("read"); + vdisk_free(pool); + VIDDBG(0, "fd = %d\n", vf->fd); + return NULL; + } + + cp = vdisk_malloc(ple->data_len+1); + if (cp == NULL) { + perror("malloc"); + vdisk_free(pool); + return (NULL); + } + + strncpy(cp, buf, ple->data_len); + buf[ple->data_len] = 0; + vdisk_free(pool); + + //XXX: for codes W2Ru and W2ku we need to convert from UTF-16 to ASCII + return cp; +} + +int +vhd_print_header(vd_file_t *vf) +{ + char *cp; + uint64_t v64; + uint32_t v32; + vhd_file_t *vhd = (vhd_file_t *)(vf->vdf); + size_t sz, max_sz; + int i; + int err; + + // Figure out max file size + + err = vdisk_size(vf->fd, &sz); + if (err) { + VIDDBG(0, "Couldn't get file size\n"); + return (err); + } + + if (vhd_get_ftr_type(vhd->ftr) == VHD_TYPE_FIXED) + max_sz = sz; + else { + uint64_t unmapped_blocks = 0; + size_t new_bytes; + + // Count blocks that haven't been allocated + for (i=0; i< vhd_get_dhdr_tbl_entries(vhd->dhdr); i++) + if (__arch__swab32(vhd->bat[i]) == + 
VHD_BAT_INVALID_ENTRY) + unmapped_blocks++; + + // XXX: Assume that block size is in 512-byte chunks + new_bytes = unmapped_blocks * (vhd->sectormap_sz + + vhd_get_dhdr_blksz(vhd->dhdr)); + max_sz = sz + new_bytes; + } + + printf("FILE %s:\n", vf->name); + printf("\tMaximum file size:\t0x%016zx\n\n", max_sz); + + + v64 = vhd_get_ftr_cookie(vhd->ftr); + cp = (char *)&v64; + printf("\tCookie:\t\t\t0x%016" PRIx64 " (\"%c%c%c%c%c%c%c%c\")\n", v64, + cp[0], cp[1], cp[2], cp[3], cp[4], cp[5], cp[6], cp[7]); + + printf("\tFeatures:\t\t0x%08x\n", vhd_get_ftr_features(vhd->ftr)); + printf("\tFile format vervion:\t0x%08x\n", vhd_get_ftr_fformat(vhd->ftr)); + printf("\tData Offset:\t\t0x%016" PRIx64 "\n", + vhd_get_ftr_dataoff(vhd->ftr)); + + v32 = vhd_get_ftr_timestamp(vhd->ftr); + printf("\ttimestamp:\t\t0x%08x (%s)\n", v32, vhd_time(&v32, NULL, 1)); + + printf("\tCreator App:\t\t0x%08x\n", vhd_get_ftr_cr_app(vhd->ftr)); + printf("\tCreator Ver:\t\t0x%08x\n", vhd_get_ftr_cr_ver(vhd->ftr)); + printf("\tCreator Host OS:\t0x%08x\n", vhd_get_ftr_cr_hostos(vhd->ftr)); + printf("\tOriginal size:\t\t0x%016" PRIx64 "\n", + vhd_get_ftr_orig_sz(vhd->ftr)); + printf("\tCurrent size:\t\t0x%016" PRIx64 "\n", + vhd_get_ftr_cur_sz(vhd->ftr)); + printf("\tGeometry:\t\t0x%08x\n", vhd_get_ftr_geom(vhd->ftr)); + printf("\tType:\t\t\t0x%08x\n", vhd_get_ftr_type(vhd->ftr)); + printf("\tChecksum:\t\t0x%08x\n", vhd_get_ftr_chksum(vhd->ftr)); + + printf("\tUnique ID:\t\t"); + cp = (char *)vhd_get_ftr_uid(vhd->ftr); + for (i=0;i<16;i++) + printf("%02x", (*cp++) & 0xff); + + printf("\n\tSaved state:\t\t0x%08x\n", vhd_get_ftr_saved_state(vhd->ftr)); + if ((vhd_get_ftr_type(vhd->ftr) == VHD_TYPE_DYNAMIC ) || + (vhd_get_ftr_type(vhd->ftr) == VHD_TYPE_DIFF )) { + + printf(" Dynamic Header:\n"); + + v64 = vhd_get_dhdr_cookie(vhd->dhdr); + cp = (char *)&v64; + printf("\t Cookie:\t\t0x%016" PRIx64 " (\"%c%c%c%c%c%c%c%c\")\n", + v64, cp[0], cp[1], cp[2], cp[3], cp[4], cp[5], cp[6], cp[7]); + printf("\t Data 
Offset:\t\t0x%016" PRIx64 "\n", + vhd_get_dhdr_dataoff(vhd->dhdr)); + printf("\t Table Offset:\t\t0x%016" PRIx64 "\n", + vhd_get_dhdr_tbloff(vhd->dhdr)); + printf("\t Max Table Entries:\t0x%08x\n", + vhd_get_dhdr_tbl_entries(vhd->dhdr)); + printf("\t Block Size:\t\t0x%08x\n", + vhd_get_dhdr_blksz(vhd->dhdr)); + printf("\t Checksum:\t\t0x%08x\n", + vhd_get_dhdr_chksum(vhd->dhdr)); + } + + if (vhd_get_ftr_type(vhd->ftr) == VHD_TYPE_DIFF ) { + + printf("\t Parent Unique ID:\t"); + cp = (char *)vhd_get_dhdr_puid(vhd->dhdr); + for (i=0;i<16;i++) + printf("%02x", (*cp++) & 0xff); + v32 = vhd_get_dhdr_ptimestamp(vhd->dhdr); + printf("\n\t Parent Timestamp:\t0x%08x (%s)\n", + v32, vhd_time(&v32, NULL, 1)); + + for (i=0;i<8;i++) { + ple_t ple; + + vhd_get_dhdr_ple(vhd->dhdr, &ple, i); + if (ple.code != 0) { + printf("\t Parent Locator Entry %d:\n", i); + + cp = (char *)&ple.code; + printf("\t\tPlatform Code:\t0x%08x " + "(\"%c%c%c%c\")\n", + ple.code, cp[3], cp[2], cp[1], cp[0]); + printf("\t\tData Space:\t0x%08x\n", + ple.data_space); + printf("\t\tData Length:\t0x%08x\n", + ple.data_len); + printf("\t\tData Offset:\t0x%016" PRIx64" \n", + ple.data_off); + cp = vhd_get_parent_name(vf, &ple); + if (cp == NULL) { + VIDDBG(0, "Can't locate parent info " + "in file\n"); + continue; + } + printf("\t\tParent Locator:\t%s\n", cp); + free(cp); + } + } + } + + return (0); +} + +int +vhd_parse_args(int argc, int operations, char *argv[], void **args) +{ + char c; + int i; + extern char *optarg; + extern int optind, opterr, optopt; + vhd_args_t *vhd_args; + + void vhd_usage() { + fprintf(stderr, "VHD-specific options: " + "-S <size(MB)> [-f|-d [-p <parent>]] [-B <size(B)>]" + " [-u UUID] [-t]\n"); + } + + vhd_args = malloc(sizeof(vhd_args_t)); + if (vhd_args == NULL) { + VIDDBG(0, "Can't allocate arguments\n"); + return (-1); + } + + memset(vhd_args, 0, sizeof(vhd_args_t)); + vhd_args->type = VHD_TYPE_NONE; + vhd_args->blocksz = 0x200000; // 2MB + + while (1) { + + c = getopt(argc, 
argv, "S:fdstB:u:p:"); + if (c == -1) + break; + + switch (c) { + case 'f': + vhd_args->type = VHD_TYPE_FIXED; + vhd_args->args_mask |= VHD_ARG_TYPE; + break; + case 's': // 's' for "sparse" + VIDDBG(0, "'-s' option is obsolete. Use '-d' instead\n"); + case 'd': + vhd_args->type = VHD_TYPE_DYNAMIC; + vhd_args->args_mask |= VHD_ARG_TYPE; + break; + case 'S': + vhd_args->vhd_sz = atol(optarg) * 1024 * 1024; + vhd_args->args_mask |= VHD_ARG_SZ; + break; + case 't': + vhd_args->args_mask |= VHD_ARG_TIME; + break; + case 'p': + vhd_args->args_mask |= VHD_ARG_PARENT; + vhd_args->parent = malloc(strlen(optarg)+1); + if (vhd_args->parent == NULL) { + VIDDBG(0, "Out of memory\n"); + goto fail; + } + strncpy(vhd_args->parent, optarg, strlen(optarg)+1); + break; + case 'B': + vhd_args->blocksz = atol(optarg); + // Must be in 512 byte chunks + if (vhd_args->blocksz & 511) { + VIDDBG(0, + "block size must be divisible by 512\n"); + goto fail; + } + vhd_args->args_mask |= VHD_ARG_BLOCKSZ; + break; + case 'u': + if ((optarg == NULL) || (strlen(optarg) != 32)) { + VIDDBG(0, "UUID is a 16-byte (32-character)" + " string\n"); + goto fail; + } + + // Convert UUID characters to hex + for(i=0;i<32;i++) { + uint8_t val; + + val = optarg[i]; + if (!isxdigit(val)) { + VIDDBG(0, "Invalid character in UUID " + "string ('%c')\n", optarg[i]); + free(vhd_args); + vhd_usage(); + return (-1); + } + if (isalpha(val)) { + val = tolower(val); + val -= ('a' - 0xa); + } else + val -= '0'; + + // two hex numbers per byte + vhd_args->uuid[i>>1] |= (val << (4*((i&1)^1))); + } + vhd_args->args_mask |= VHD_ARG_UUID; + break; + default: + vhd_usage(); + goto fail; + } + } + + if ((vhd_args->parent != NULL) && (vhd_args->type == VHD_TYPE_FIXED)) { + VIDDBG(0, "Fixed VHD cannot have a parent\n"); + goto fail; + } + + if (operations & VDISK_OP_CREATE) { + if (vhd_args->parent == NULL) { + if ((vhd_args->vhd_sz == 0) || + (vhd_args->type == VHD_TYPE_NONE)) + goto fail; + } + } + + if (vhd_args->vhd_sz % 
vhd_args->blocksz) { + VIDDBG(0, "File size must be multiple of block size\n"); + goto fail; + } + + if (operations & VDISK_OP_MODIFY) { + if (vhd_args->args_mask & VHD_ARG_PARENT) { + if (vhd_args->type == VHD_TYPE_FIXED) { + VIDDBG(0, "Fixed VHDs can't have parents\n"); + goto fail; + } + } + if (vhd_args->args_mask & (VHD_ARG_SZ | VHD_ARG_BLOCKSZ)) { + VIDDBG(0, "Can't modify VHD's size or block size\n"); + goto fail; + } + } + + if (vhd_args->parent != NULL) { + vhd_args->type = VHD_TYPE_DIFF; + if (vhd_args->args_mask & (VHD_ARG_SZ | VHD_ARG_BLOCKSZ)) { + VIDDBG(0, "Differencing VHD's size and block size " + "are inherited from parent\n"); + goto fail; + } + } + + + *args = vhd_args; + return (0); + +fail: + if (vhd_args->parent) + free(vhd_args->parent); + free(vhd_args); + vhd_usage(); + return (-1); +} + +// Store differencing file's parent information +static int +vhd_store_parent(int vfd, vhd_file_t *vhd, vhd_file_t *pvhd, + char *parentname, loff_t *data) +{ + uint32_t bat_sz; + ple_t ple; + int i; + int err; + size_t bytes; + + vhd_set_dhdr_puid(vhd->dhdr, vhd_get_ftr_uid(pvhd->ftr)); + vhd_set_dhdr_ptimestamp(vhd->dhdr, + vhd_get_ftr_timestamp(pvhd->ftr)); + + memset(&ple, 0, sizeof(ple_t)); + for (i=0;i<8;i++) + vhd_set_dhdr_ple(vhd->dhdr, &ple, i); + + if (parentname[0] == '/') + ple.code = VHD_DYN_PLE_ABS; + else + ple.code = VHD_DYN_PLE_REL; + + // XXX: The spec says this is number of 512b sectors, + // but file created by MS's Virtual PC tool seems to + // think this is number of bytes, aligned at 512b + ple.data_space = (strlen(parentname) + 1 + 512) + & (~511); + ple.data_len = strlen(parentname) + 1; + + bat_sz = vhd_get_dhdr_tbl_entries(vhd->dhdr) << 2; + + ple.data_off = VHD_DHDR_SZ + VHD_FTR_SZ + + bat_sz + + ((bat_sz & 511) ? 
(512-(bat_sz&511)) : 0) + + 512; // XXX: see comment in vhd_create_vdisk() + vhd_set_dhdr_ple(vhd->dhdr, &ple, 0); + + // Recalculate checksum + vhd_set_dhdr_chksum(vhd->dhdr, + vhd_chksum(vhd->dhdr, VHD_DHDR_SZ, + &vhd->dhdr[VHD_DHDR_CHKSUM_OFF])); + + if (lseek(vfd, VHD_FTR_SZ, SEEK_SET) != VHD_FTR_SZ) { + err = errno; + VIDDBG(0, "lseek: %s", strerror(err)); + return (err); + } + + // Write the dynamic header + bytes = write(vfd, vhd->dhdr, VHD_DHDR_SZ); + if (bytes != VHD_DHDR_SZ) { + err = errno; + VIDDBG(0, "write: %s", strerror(err)); + return (err); + } + + // Write PLE + bytes = lseek(vfd, ple.data_off, SEEK_SET); + if (bytes != ple.data_off) { + err = errno; + VIDDBG(0, "lseek: %s", strerror(err)); + return (err); + } + bytes = write(vfd, parentname, strlen(parentname)+1); + if (bytes != strlen(parentname)+1) { + err = errno; + VIDDBG(0, "write: %s", strerror(err)); + return (err); + } + + if (data != NULL) + *data = (loff_t)ple.data_off + (loff_t)ple.data_space; + + return (0); +} + + +int +vhd_modify_vdisk(struct vdisk_dev *vdisk, void *args) +{ + vhd_args_t *vhd_args = args; + vd_file_t *vf = NULL; + vhd_file_t *vhd; + size_t sz, bytes; + int err; + int store_footer = 0; + struct list_head *ptr; + int stop = 0; + + + // XXX: We always make a single pass + list_for_each(ptr, &vdisk->vdf_head) { + + vf = list_entry(ptr, vd_file_t, vdf_list); + if ((vf == NULL) || (vf->vdf == NULL)) { + VIDDBG(0, "Can't access vdisk's structures\n"); + return (-1); + } + vhd = (vhd_file_t *)vf->vdf; + + // Close and reopen file (it may have been open O_DIRECT) + err = vdisk_close(vf->fd); + if (err) { + VIDDBG(0, "Can't close %s:%d\n", vf->name, err); + return (err); + } + + vf->fd = open(vf->name, O_RDWR, 0644); + if (vf->fd == -1) { + err = errno; + VIDDBG(0, "Can't open %s:%d\n", vf->name, strerror(errno)); + return (err); + } + + // Update UUID + if (vhd_args->args_mask & VHD_ARG_UUID) { + + vhd_set_ftr_uid(vhd->ftr, vhd_args->uuid); + + store_footer = 1; + stop = 1; 
+ } + + // Change parent name + if (vhd_args->args_mask & VHD_ARG_PARENT) { + vhd_file_t *pvhd; + struct vdisk_dev parent; + vd_file_t *pvf; + + // Open parent file + err = vdisk_init(&parent, vhd_args->parent, NULL, 0); + if (err) { + VIDDBG(0, "Failed to initialize state for " + "parent %s\n", vhd_args->parent); + return (err); + } + pvf = list_entry(parent.vdf_head.next, vd_file_t, vdf_list); + pvhd = (vhd_file_t *)pvf->vdf; + + // Update dynamic header and parent data + err = vhd_store_parent(vf->fd, vhd, pvhd, + vhd_args->parent, NULL); + if (err) { + VIDDBG(0, "Failed to store parent name (%s)\n", + vhd_args->parent); + vdisk_fini(&parent); + return (err); + } + vdisk_fini(&parent); + + store_footer = 1; + stop = 1; + } + + // Update timestamp + if (vhd_args->args_mask & VHD_ARG_TIME) { + uint32_t curtime, ftime; + + curtime = time(NULL); + if (curtime == -1) { + perror("time"); + return (errno); + } + (void)vhd_time(&ftime, &curtime, 0); + vhd_set_ftr_timestamp(vhd->ftr, ftime); + + stop = 1; + } + + // Recompute footer's checksum + vhd_set_ftr_chksum(vhd->ftr, + vhd_chksum(vhd->ftr, VHD_FTR_SZ, + &vhd->ftr[VHD_FTR_CHKSUM_OFF])); + + // Write the footer back if needed + if (store_footer) { + + err = vdisk_size(vf->fd, &sz); + if (err != 0) { + VIDDBG(0, "Can't determine vdisk's size\n"); + return (-1); + } + + if (lseek(vf->fd, (sz-VHD_FTR_SZ), SEEK_SET) != + (sz - VHD_FTR_SZ)) { + perror("lseek"); + return (errno); + } + bytes = write(vf->fd, vhd->ftr, VHD_FTR_SZ); + if (bytes != VHD_FTR_SZ) { + perror("write"); + return (errno); + } + + // For non-fixed disks write footer at front as well + if (vhd_get_ftr_type(vhd->ftr) != VHD_TYPE_FIXED) { + if (lseek(vf->fd, 0, SEEK_SET) != 0) { + perror("lseek"); + return (errno); + } + bytes = write(vf->fd, vhd->ftr, VHD_FTR_SZ); + if (bytes != VHD_FTR_SZ) { + perror("write"); + return (errno); + } + } + } + + if (stop) + break; + } + + if (fsync(vf->fd)) + VIDDBG(0, "fsync: %s\n", strerror(errno)); + + return (0); +} 
+ +int +vhd_create_vdisk(char *filename, void *args) +{ + vhd_args_t *vhd_args = args; + vhd_file_t vhd; + uint32_t curtime, ftime; + int vfd = -1; + ssize_t bytes; + int i; + int err = 0; + char *hdr_pool = NULL, *ftr_pool = NULL; + struct vdisk_dev parent; + + vfd = open(filename, O_CREAT|O_EXCL|O_RDWR, 0644); + if (vfd == -1) { + if (errno == EEXIST) { + size_t sz; + + // File already exists + if (vhd_args->type != VHD_TYPE_FIXED) { + VIDDBG(0, "Raw files can only be converted to " + "fixed VHD format\n"); + return (EINVAL); + } + + vfd = open(filename, O_RDWR, 0644); + if (vfd == -1) { + err = errno; + VIDDBG(0, "vfd open(%s, O_RDWR) failed: %s\n", + filename, strerror(err)); + return (err); + } + + err = vdisk_size(vfd, &sz); + if (err) { + VIDDBG(0, "vdisk_size(%s) failed: %s\n", + filename, strerror(err)); + return (err); + } + + if (vhd_args->vhd_sz < sz) { + VIDDBG(0, "WARNING: Truncating %s (%ld bytes) " + "to %ld bytes\n", + filename, sz, vhd_args->vhd_sz); + + err = ftruncate(vfd, vhd_args->vhd_sz); + if (err == -1) { + err = errno; + VIDDBG(0, "ftruncate(%s, %ld): %s\n", + filename, vhd_args->vhd_sz, + strerror(err)); + return (err); + } + } + } else { + err = errno; + VIDDBG(0, "vfd open(%s, O_CREAT|O_EXCL|O_RDWR) " + "failed: %s\n", filename, strerror(err)); + return (err); + } + } + + parent.vdfd = NULL; + + memset((char *)&vhd, 0, sizeof(vhd)); + ftr_pool = vhd.ftr = vdisk_malloc(VHD_FTR_SZ+512); + if (vhd.ftr == NULL) { + VIDDBG(0, "Couldn't allocate VHD footer\n"); + close(vfd); + return (ENOMEM); + } + while ((addr_t)vhd.ftr & 511) vhd.ftr++; + + vhd_set_ftr_cookie(vhd.ftr, VHD_COOKIE); + vhd_set_ftr_features(vhd.ftr, VHD_FEATURES_RSVD); + vhd_set_ftr_fformat(vhd.ftr, VHD_FORMAT_VER_1); + vhd_set_ftr_type(vhd.ftr, vhd_args->type); + + curtime = time(NULL); + if (curtime == -1) { + err = errno; + perror("time"); + goto out; + } + (void)vhd_time(&ftime, &curtime, 0); + vhd_set_ftr_timestamp(vhd.ftr, ftime); + + vhd_set_ftr_cr_app(vhd.ftr, 
VHD_CREATOR_APP); + vhd_set_ftr_cr_ver(vhd.ftr, VHD_CREATOR_VER_1); + vhd_set_ftr_cr_hostos(vhd.ftr, VHD_CREATOR_HOST_OS); + vhd_set_ftr_orig_sz(vhd.ftr, vhd_args->vhd_sz); + vhd_set_ftr_cur_sz(vhd.ftr, vhd_args->vhd_sz); + vhd_set_ftr_geom(vhd.ftr, vhd_chs(vhd_args->vhd_sz)); + + vhd_set_ftr_uid(vhd.ftr, vhd_args->uuid); + + if (vhd_args->type == VHD_TYPE_FIXED) + vhd_set_ftr_dataoff(vhd.ftr, VHD_FIXED_OFFSET); + else if ((vhd_args->type == VHD_TYPE_DYNAMIC) || + (vhd_args->type == VHD_TYPE_DIFF)) + vhd_set_ftr_dataoff(vhd.ftr, VHD_FTR_SZ); + else + ASSERT(0); + + vhd_set_ftr_chksum(vhd.ftr, vhd_chksum(vhd.ftr, VHD_FTR_SZ, + &vhd.ftr[VHD_FTR_CHKSUM_OFF])); + + // Create dynamic header + if ((vhd_args->type == VHD_TYPE_DYNAMIC) || + (vhd_args->type == VHD_TYPE_DIFF)) { + + uint32_t bat_entry, bat_sz; + loff_t data; + vhd_file_t *pvhd = NULL; + vd_file_t *pvf; + + if (vhd_args->type == VHD_TYPE_DIFF) { + // Read parent data + err = vdisk_init(&parent, vhd_args->parent, NULL, 0); + if (err) { + VIDDBG(0, "Failed to initialize state for " + "parent %s\n", vhd_args->parent); + return (err); + } + pvf = list_entry(parent.vdf_head.next, + vd_file_t, vdf_list); + pvhd = (vhd_file_t *)pvf->vdf; + + // Update footer fields inherited from parent + vhd_set_ftr_orig_sz(vhd.ftr, + vhd_get_ftr_orig_sz(pvhd->ftr)); + vhd_set_ftr_cur_sz(vhd.ftr, + vhd_get_ftr_cur_sz(pvhd->ftr)); + vhd_set_ftr_geom(vhd.ftr, + vhd_get_ftr_geom(pvhd->ftr)); + + vhd_args->vhd_sz = vhd_get_ftr_cur_sz(vhd.ftr); + } + + hdr_pool = vhd.dhdr = vdisk_malloc(VHD_DHDR_SZ+512); + if (vhd.dhdr == NULL) { + vdisk_free(ftr_pool); + VIDDBG(0, "Couldn't allocate dynamic header\n"); + err = ENOMEM; + goto out; + } + while ((addr_t)vhd.dhdr & 511) vhd.dhdr++; + + vhd_set_dhdr_cookie(vhd.dhdr, VHD_DYN_COOKIE); + vhd_set_dhdr_dataoff(vhd.dhdr, VHD_DYN_OFFSET); + vhd_set_dhdr_tbloff(vhd.dhdr, VHD_FTR_SZ+VHD_DHDR_SZ); + vhd_set_dhdr_hdrver(vhd.dhdr, VHD_DYN_HDR_VER_1); + vhd_set_dhdr_tbl_entries(vhd.dhdr, + 
vhd_args->vhd_sz/vhd_args->blocksz); + vhd_set_dhdr_blksz(vhd.dhdr, vhd_args->blocksz); + + vhd_set_dhdr_chksum(vhd.dhdr, + vhd_chksum(vhd.dhdr, VHD_DHDR_SZ, + &vhd.dhdr[VHD_DHDR_CHKSUM_OFF])); + + // Write the copy of the footer first + bytes = write(vfd, vhd.ftr, VHD_FTR_SZ); + if (bytes != VHD_FTR_SZ) { + perror("write"); + err = errno; + goto out; + } + + // Write the dynamic header + bytes = write(vfd, vhd.dhdr, VHD_DHDR_SZ); + if (bytes != VHD_DHDR_SZ) { + perror("write"); + vdisk_free(vhd.dhdr); + close(vfd); + return (errno); + } + + // Initialize BAT + // XXX: Make it faster perhaps? + bat_entry = VHD_BAT_INVALID_ENTRY; + for (i=0; i< vhd_get_dhdr_tbl_entries(vhd.dhdr); i++) { + bytes = write(vfd, &bat_entry, 4); + if (bytes != 4) { + err = errno; + perror("write"); + goto out; + } + } + + // BAT must end on sector boundary (512 bytes) + bat_entry = 0; + bat_sz = vhd_get_dhdr_tbl_entries(vhd.dhdr) << 2; + if (bat_sz & 511) { + for (i=0; i<512-(bat_sz&511);i++) { + // Write 1 byte at a time + bytes = write(vfd, &bat_entry, 1); + if (bytes != 1) { + perror("write"); + err = errno; + goto out; + } + } + } + + // XXX: It appears that there is a 512B block + // at the end of BAT, which is not mentioned in the spec + for (i=0; i<512>>2; i++) { + bytes = write(vfd, &bat_entry, 4); + if (bytes != 4) { + err = errno; + perror("write"); + goto out; + } + } + + if (vhd_args->type == VHD_TYPE_DIFF) { + // This will store dynamic header again, but that's OK + err = vhd_store_parent(vfd, &vhd, pvhd, + vhd_args->parent, &data); + if (err) { + VIDDBG(0, "Failed to store parent name (%s)\n", + vhd_args->parent); + return (err); + } + + bytes = lseek(vfd, data, SEEK_SET); + if (bytes != data) { + err = errno; + perror("lseek"); + goto out; + } + } + } else { + // for fixed disk, seek to the end of the file + if (lseek(vfd, vhd_args->vhd_sz, SEEK_SET) != + vhd_args->vhd_sz) { + perror("lseek"); + err = errno; + goto out; + } + } + + // Write footer. 
For fixed disks allocate whole filesize + bytes = write(vfd, vhd.ftr, VHD_FTR_SZ); + if (bytes != VHD_FTR_SZ) { + perror("write"); + err = errno; + goto out; + } + +out: + if (parent.vdfd != NULL) + vdisk_fini(&parent); + + if (ftr_pool) + vdisk_free(ftr_pool); + if (hdr_pool) + vdisk_free(hdr_pool); + if (vfd != -1) { + if (fsync(vfd)) + VIDDBG(0, "fsync: %s\n", strerror(errno)); + close(vfd); + } + + return (err); +} _______________________________________________ Xen-devel mailing list Xen-devel@xxxxxxxxxxxxxxxxxxx http://lists.xensource.com/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |