# HG changeset patch
# User emellor@xxxxxxxxxxxxxxxxxxxxxx
# Node ID 689da5e0a970ea5af48c9e8371fb446bc89e9387
# Parent 849723752858df693e189e23a2db2a3464950165
Added a basic integrity checker, and some basic ability to recover from store
corruption, rather than just spewing error messages and exiting.
Added a xenstore-control executable, which sends commands to xenstored.
Currently, the only command is 'check', which triggers an integrity check.
(The integrity check is also triggered whenever a corrupted store is detected).
Signed-off-by: Ewan Mellor <ewan@xxxxxxxxxxxxx>
diff -r 849723752858 -r 689da5e0a970 .hgignore
--- a/.hgignore Thu Mar 2 01:01:17 2006
+++ b/.hgignore Thu Mar 2 01:09:23 2006
@@ -166,6 +166,7 @@
^tools/xenstore/xenstore-read$
^tools/xenstore/xenstore-rm$
^tools/xenstore/xenstore-write$
+^tools/xenstore/xenstore-control$
^tools/xenstore/xenstore-ls$
^tools/xenstore/xenstored$
^tools/xenstore/xenstored_test$
diff -r 849723752858 -r 689da5e0a970 tools/xenstore/Makefile
--- a/tools/xenstore/Makefile Thu Mar 2 01:01:17 2006
+++ b/tools/xenstore/Makefile Thu Mar 2 01:09:23 2006
@@ -27,7 +27,10 @@
CLIENTS += xenstore-write
CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS))
-all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-ls
+all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-control
xenstore-ls
+
+test_interleaved_transactions: test_interleaved_transactions.o
+ $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
testcode: xs_test xenstored_test xs_random
@@ -35,13 +38,16 @@
$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@
$(CLIENTS): xenstore-%: xenstore_%.o libxenstore.so
- $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@
+ $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
$(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c
$(COMPILE.c) -DCLIENT_$(*F) -o $@ $<
+xenstore-control: xenstore_control.o libxenstore.so
+ $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
+
xenstore-ls: xsls.o libxenstore.so
- $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@
+ $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
xenstored_test: xenstored_core_test.o xenstored_watch_test.o
xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o
fake_libxc.o utils.o tdb.o
$(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
@@ -77,7 +83,8 @@
clean: testsuite-clean
rm -f *.o *.opic *.so
rm -f xenstored xs_random xs_stress xs_crashme
- rm -f xs_test xenstored_test xs_tdb_dump xenstore-ls $(CLIENTS)
+ rm -f xs_test xenstored_test xs_tdb_dump xenstore-control xenstore-ls
+ rm -f $(CLIENTS)
$(RM) $(PROG_DEP)
print-dir:
@@ -129,7 +136,7 @@
tarball: clean
cd .. && tar -c -j -v -h -f xenstore.tar.bz2 xenstore/
-install: libxenstore.so xenstored xenstore-ls $(CLIENTS)
+install: all
$(INSTALL_DIR) -p $(DESTDIR)/var/run/xenstored
$(INSTALL_DIR) -p $(DESTDIR)/var/lib/xenstored
$(INSTALL_DIR) -p $(DESTDIR)/usr/bin
@@ -137,6 +144,7 @@
$(INSTALL_DIR) -p $(DESTDIR)/usr/include
$(INSTALL_PROG) xenstored $(DESTDIR)/usr/sbin
$(INSTALL_PROG) $(CLIENTS) $(DESTDIR)/usr/bin
+ $(INSTALL_PROG) xenstore-control $(DESTDIR)/usr/bin
$(INSTALL_PROG) xenstore-ls $(DESTDIR)/usr/bin
$(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR)
$(INSTALL_DATA) libxenstore.so $(DESTDIR)/usr/$(LIBDIR)
diff -r 849723752858 -r 689da5e0a970 tools/xenstore/xenstored_core.c
--- a/tools/xenstore/xenstored_core.c Thu Mar 2 01:01:17 2006
+++ b/tools/xenstore/xenstored_core.c Thu Mar 2 01:09:23 2006
@@ -60,6 +60,18 @@
static char *tracefile = NULL;
static TDB_CONTEXT *tdb_ctx;
+static void corrupt(struct connection *conn, const char *fmt, ...);
+static void check_store();
+
+#define log(...) \
+ do { \
+ char *s = talloc_asprintf(NULL, __VA_ARGS__); \
+ trace("%s\n", s); \
+ syslog(LOG_ERR, "%s", s); \
+ talloc_free(s); \
+ } while (0)
+
+
#ifdef TESTING
static bool failtest = false;
@@ -103,33 +115,6 @@
#endif /* TESTING */
#include "xenstored_test.h"
-
-/* FIXME: Ideally, this should never be called. Some can be eliminated. */
-/* Something is horribly wrong: shutdown immediately. */
-void __attribute__((noreturn)) corrupt(struct connection *conn,
- const char *fmt, ...)
-{
- va_list arglist;
- char *str;
- int saved_errno = errno;
-
- va_start(arglist, fmt);
- str = talloc_vasprintf(NULL, fmt, arglist);
- va_end(arglist);
-
- trace("xenstored corruption: connection id %i: err %s: %s",
- conn ? (int)conn->id : -1, strerror(saved_errno), str);
- eprintf("xenstored corruption: connection id %i: err %s: %s",
- conn ? (int)conn->id : -1, strerror(saved_errno), str);
-#ifdef TESTING
- /* Allow them to attach debugger. */
- sleep(30);
-#endif
- syslog(LOG_DAEMON,
- "xenstored corruption: connection id %i: err %s: %s",
- conn ? (int)conn->id : -1, strerror(saved_errno), str);
- _exit(2);
-}
TDB_CONTEXT *tdb_context(struct connection *conn)
{
@@ -216,7 +201,8 @@
now = time(NULL);
tm = localtime(&now);
- trace("%s %p %02d:%02d:%02d %s (", prefix, conn,
+ trace("%s %p %p %04d%02d%02d %02d:%02d:%02d %s (", prefix, conn,
+ conn->transaction, tm->year + 1900, tm->mon + 1, tm->mday,
tm->tm_hour, tm->tm_min, tm->tm_sec,
sockmsg_string(data->hdr.msg.type));
@@ -837,8 +823,6 @@
return 0;
}
-/* Be careful: create heirarchy, put entry in existing parent *last*.
- * This helps fsck if we die during this. */
static struct node *create_node(struct connection *conn,
const char *name,
void *data, unsigned int datalen)
@@ -939,8 +923,9 @@
{
unsigned int i;
- /* Delete self, then delete children. If something goes wrong,
- * consistency check will clean up this way. */
+ /* Delete self, then delete children. If we crash, then the worst
+ that can happen is the children will continue to take up space, but
+ will otherwise be unreachable. */
delete_node_single(conn, node);
/* Delete children, too. */
@@ -950,9 +935,14 @@
child = read_node(conn,
talloc_asprintf(node, "%s/%s", node->name,
node->children + i));
- if (!child)
- corrupt(conn, "No child '%s' found", child);
- delete_node(conn, child);
+ if (child) {
+ delete_node(conn, child);
+ }
+ else {
+ trace("delete_node: No child '%s/%s' found!\n",
+ node->name, node->children + i);
+ /* Skip it, we've already deleted the parent. */
+ }
}
}
@@ -976,12 +966,15 @@
}
}
corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
+ return false;
}
static int _rm(struct connection *conn, struct node *node, const char *name)
{
- /* Delete from parent first, then if something explodes fsck cleans. */
+ /* Delete from parent first, then if we crash, the worst that can
+ happen is the child will continue to take up space, but will
+ otherwise be unreachable. */
struct node *parent = read_node(conn, get_parent(name));
if (!parent) {
send_error(conn, EINVAL);
@@ -1000,10 +993,11 @@
static void internal_rm(const char *name)
{
- char *tname = talloc_strdup(talloc_autofree_context(), name);
+ char *tname = talloc_strdup(NULL, name);
struct node *node = read_node(NULL, tname);
if (node)
_rm(NULL, node, tname);
+ talloc_free(tname);
}
@@ -1149,18 +1143,19 @@
case XS_DEBUG:
if (streq(in->buffer, "print"))
xprintf("debug: %s", in->buffer + get_string(in, 0));
+ if (streq(in->buffer, "check"))
+ check_store();
#ifdef TESTING
/* For testing, we allow them to set id. */
if (streq(in->buffer, "setid")) {
conn->id = atoi(in->buffer + get_string(in, 0));
- send_ack(conn, XS_DEBUG);
} else if (streq(in->buffer, "failtest")) {
if (get_string(in, 0) < in->used)
srandom(atoi(in->buffer + get_string(in, 0)));
- send_ack(conn, XS_DEBUG);
failtest = true;
}
#endif /* TESTING */
+ send_ack(conn, XS_DEBUG);
break;
case XS_WATCH:
@@ -1258,7 +1253,7 @@
if (in->hdr.msg.len > PATH_MAX) {
#ifndef TESTING
- syslog(LOG_DAEMON, "Client tried to feed us %i",
+ syslog(LOG_ERR, "Client tried to feed us %i",
in->hdr.msg.len);
#endif
goto bad_client;
@@ -1425,10 +1420,16 @@
balloon driver will pick up stale entries. In the case of
the balloon driver, this can be fatal.
*/
- char *tlocal = talloc_strdup(talloc_autofree_context(),
- "/local");
+ char *tlocal = talloc_strdup(NULL, "/local");
+
+ check_store();
+
internal_rm("/local");
create_node(NULL, tlocal, NULL, 0);
+
+ talloc_free(tlocal);
+
+ check_store();
}
else {
tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT,
@@ -1439,10 +1440,92 @@
manual_node("/", "tool");
manual_node("/tool", "xenstored");
manual_node("/tool/xenstored", NULL);
- }
-
- /* FIXME: Fsck */
-}
+
+ check_store();
+ }
+}
+
+static char *child_name(const char *s1, const char *s2)
+{
+ if (strcmp(s1, "/")) {
+ return talloc_asprintf(NULL, "%s/%s", s1, s2);
+ }
+ else {
+ return talloc_asprintf(NULL, "/%s", s2);
+ }
+}
+
+static void check_store_(const char *name)
+{
+ struct node *node = read_node(NULL, name);
+
+ if (node) {
+ size_t i = 0;
+
+ while (i < node->childlen) {
+ size_t childlen = strlen(node->children + i);
+ char * childname = child_name(node->name,
+ node->children + i);
+ struct node *childnode = read_node(NULL, childname);
+
+ if (childnode) {
+ check_store_(childname);
+ i += childlen + 1;
+ }
+ else {
+ log("check_store: No child '%s' found!\n",
+ childname);
+
+ memdel(node->children, i, childlen + 1,
+ node->childlen);
+ node->childlen -= childlen + 1;
+ write_node(NULL, node);
+ }
+
+ talloc_free(childname);
+ }
+ }
+ else {
+ /* Impossible, because no database should ever be without the
+ root, and otherwise, we've just checked in our caller
+ (which made a recursive call to get here). */
+
+ log("check_store: No child '%s' found: impossible!", name);
+ }
+}
+
+
+static void check_store()
+{
+ char * root = talloc_strdup(NULL, "/");
+ log("Checking store ...");
+ check_store_(root);
+ log("Checking store complete.");
+ talloc_free(root);
+}
+
+
+/* Something is horribly wrong: check the store. */
+static void corrupt(struct connection *conn, const char *fmt, ...)
+{
+ va_list arglist;
+ char *str;
+ int saved_errno = errno;
+
+ va_start(arglist, fmt);
+ str = talloc_vasprintf(NULL, fmt, arglist);
+ va_end(arglist);
+
+ log("corruption detected by connection %i: err %s: %s",
+ conn ? (int)conn->id : -1, strerror(saved_errno), str);
+
+#ifdef TESTING
+ /* Allow them to attach debugger. */
+ sleep(30);
+#endif
+ check_store();
+}
+
static void write_pidfile(const char *pidfile)
{
diff -r 849723752858 -r 689da5e0a970 tools/xenstore/xenstored_core.h
--- a/tools/xenstore/xenstored_core.h Thu Mar 2 01:01:17 2006
+++ b/tools/xenstore/xenstored_core.h Thu Mar 2 01:09:23 2006
@@ -148,10 +148,6 @@
/* Replace the tdb: required for transaction code */
bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb);
-/* Fail due to excessive corruption, capitalist pigdogs! */
-void __attribute__((noreturn)) corrupt(struct connection *conn,
- const char *fmt, ...);
-
struct connection *new_connection(connwritefn_t *write, connreadfn_t *read);
diff -r 849723752858 -r 689da5e0a970 tools/xenstore/xenstore_control.c
--- /dev/null Thu Mar 2 01:01:17 2006
+++ b/tools/xenstore/xenstore_control.c Thu Mar 2 01:09:23 2006
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xs.h"
+
+
+int main(int argc, char **argv)
+{
+ if (argc < 2 ||
+ strcmp(argv[1], "check"))
+ {
+ fprintf(stderr,
+ "Usage:\n"
+ "\n"
+ " %s check\n"
+ "\n", argv[0]);
+ return 2;
+ }
+
+ struct xs_handle * xsh = xs_daemon_open();
+
+ xs_debug_command(xsh, argv[1], NULL, 0);
+
+ xs_daemon_close(xsh);
+
+ return 0;
+}
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|