WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] Added a basic integrity checker, and some basic ability

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] Added a basic integrity checker, and some basic ability to recover from store
From: Xen patchbot -3.0-testing <patchbot-3.0-testing@xxxxxxxxxxxxxxxxxxx>
Date: Fri, 10 Mar 2006 19:48:17 +0000
Delivery-date: Fri, 10 Mar 2006 19:49:22 +0000
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User emellor@xxxxxxxxxxxxxxxxxxxxxx
# Node ID afd7d4ba113bf96bca099111fa62335165bd2288
# Parent  2142adf67d6a62fbcb70388bbbcb591d438cfa17
Added a basic integrity checker, and some basic ability to recover from store
corruption, rather than just spewing error messages and exiting.

Added a xenstore-control executable, which sends commands to xenstored.
Currently, the only command is 'check', which triggers an integrity check.
(The integrity check is also triggered whenever a corrupted store is detected).

Signed-off-by: Ewan Mellor <ewan@xxxxxxxxxxxxx>

diff -r 2142adf67d6a -r afd7d4ba113b .hgignore
--- a/.hgignore Wed Mar  8 22:56:55 2006
+++ b/.hgignore Wed Mar  8 22:57:02 2006
@@ -162,6 +162,7 @@
 ^tools/xenstore/xenstore-read$
 ^tools/xenstore/xenstore-rm$
 ^tools/xenstore/xenstore-write$
+^tools/xenstore/xenstore-control$
 ^tools/xenstore/xenstore-ls$
 ^tools/xenstore/xenstored$
 ^tools/xenstore/xenstored_test$
diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/Makefile
--- a/tools/xenstore/Makefile   Wed Mar  8 22:56:55 2006
+++ b/tools/xenstore/Makefile   Wed Mar  8 22:57:02 2006
@@ -27,7 +27,10 @@
 CLIENTS += xenstore-write
 CLIENTS_OBJS := $(patsubst xenstore-%,xenstore_%.o,$(CLIENTS))
 
-all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-ls
+all: libxenstore.so xenstored $(CLIENTS) xs_tdb_dump xenstore-control 
xenstore-ls
+
+test_interleaved_transactions: test_interleaved_transactions.o
+       $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
 
 testcode: xs_test xenstored_test xs_random
 
@@ -35,13 +38,16 @@
        $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -lxenctrl -o $@
 
 $(CLIENTS): xenstore-%: xenstore_%.o libxenstore.so
-       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@
+       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
 
 $(CLIENTS_OBJS): xenstore_%.o: xenstore_client.c
        $(COMPILE.c) -DCLIENT_$(*F) -o $@ $<
 
+xenstore-control: xenstore_control.o libxenstore.so
+       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
+
 xenstore-ls: xsls.o libxenstore.so
-       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -lxenctrl -L. -lxenstore -o $@
+       $(LINK.o) $< $(LOADLIBES) $(LDLIBS) -L. -lxenstore -o $@
 
 xenstored_test: xenstored_core_test.o xenstored_watch_test.o 
xenstored_domain_test.o xenstored_transaction_test.o xs_lib.o talloc_test.o 
fake_libxc.o utils.o tdb.o
        $(LINK.o) $^ $(LOADLIBES) $(LDLIBS) -o $@
@@ -77,7 +83,8 @@
 clean: testsuite-clean
        rm -f *.o *.opic *.so
        rm -f xenstored xs_random xs_stress xs_crashme
-       rm -f xs_test xenstored_test xs_tdb_dump xenstore-ls $(CLIENTS)
+       rm -f xs_test xenstored_test xs_tdb_dump xenstore-control xenstore-ls
+       rm -f $(CLIENTS)
        $(RM) $(PROG_DEP)
 
 print-dir:
@@ -129,7 +136,7 @@
 tarball: clean
        cd .. && tar -c -j -v -h -f xenstore.tar.bz2 xenstore/
 
-install: libxenstore.so xenstored xenstore-ls $(CLIENTS)
+install: all
        $(INSTALL_DIR) -p $(DESTDIR)/var/run/xenstored
        $(INSTALL_DIR) -p $(DESTDIR)/var/lib/xenstored
        $(INSTALL_DIR) -p $(DESTDIR)/usr/bin
@@ -137,6 +144,7 @@
        $(INSTALL_DIR) -p $(DESTDIR)/usr/include
        $(INSTALL_PROG) xenstored $(DESTDIR)/usr/sbin
        $(INSTALL_PROG) $(CLIENTS) $(DESTDIR)/usr/bin
+       $(INSTALL_PROG) xenstore-control $(DESTDIR)/usr/bin
        $(INSTALL_PROG) xenstore-ls $(DESTDIR)/usr/bin
        $(INSTALL_DIR) -p $(DESTDIR)/usr/$(LIBDIR)
        $(INSTALL_DATA) libxenstore.so $(DESTDIR)/usr/$(LIBDIR)
diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/xenstored_core.c
--- a/tools/xenstore/xenstored_core.c   Wed Mar  8 22:56:55 2006
+++ b/tools/xenstore/xenstored_core.c   Wed Mar  8 22:57:02 2006
@@ -60,6 +60,18 @@
 static char *tracefile = NULL;
 static TDB_CONTEXT *tdb_ctx;
 
+static void corrupt(struct connection *conn, const char *fmt, ...);
+static void check_store();
+
+#define log(...)                                                       \
+       do {                                                            \
+               char *s = talloc_asprintf(NULL, __VA_ARGS__);           \
+               trace("%s\n", s);                                       \
+               syslog(LOG_ERR, "%s",  s);                              \
+               talloc_free(s);                                         \
+       } while (0)
+
+
 #ifdef TESTING
 static bool failtest = false;
 
@@ -103,33 +115,6 @@
 #endif /* TESTING */
 
 #include "xenstored_test.h"
-
-/* FIXME: Ideally, this should never be called.  Some can be eliminated. */
-/* Something is horribly wrong: shutdown immediately. */
-void __attribute__((noreturn)) corrupt(struct connection *conn,
-                                      const char *fmt, ...)
-{
-       va_list arglist;
-       char *str;
-       int saved_errno = errno;
-
-       va_start(arglist, fmt);
-       str = talloc_vasprintf(NULL, fmt, arglist);
-       va_end(arglist);
-
-       trace("xenstored corruption: connection id %i: err %s: %s",
-               conn ? (int)conn->id : -1, strerror(saved_errno), str);
-       eprintf("xenstored corruption: connection id %i: err %s: %s",
-               conn ? (int)conn->id : -1, strerror(saved_errno), str);
-#ifdef TESTING
-       /* Allow them to attach debugger. */
-       sleep(30);
-#endif
-       syslog(LOG_DAEMON,
-              "xenstored corruption: connection id %i: err %s: %s",
-              conn ? (int)conn->id : -1, strerror(saved_errno), str);
-       _exit(2);
-}
 
 TDB_CONTEXT *tdb_context(struct connection *conn)
 {
@@ -216,7 +201,8 @@
        now = time(NULL);
        tm = localtime(&now);
 
-       trace("%s %p %02d:%02d:%02d %s (", prefix, conn,
+       trace("%s %p %p %04d%02d%02d %02d:%02d:%02d %s (", prefix, conn,
+             conn->transaction, tm->year + 1900, tm->mon + 1, tm->mday,
              tm->tm_hour, tm->tm_min, tm->tm_sec,
              sockmsg_string(data->hdr.msg.type));
        
@@ -838,8 +824,6 @@
        return 0;
 }
 
-/* Be careful: create heirarchy, put entry in existing parent *last*.
- * This helps fsck if we die during this. */
 static struct node *create_node(struct connection *conn, 
                                const char *name,
                                void *data, unsigned int datalen)
@@ -940,8 +924,9 @@
 {
        unsigned int i;
 
-       /* Delete self, then delete children.  If something goes wrong,
-        * consistency check will clean up this way. */
+       /* Delete self, then delete children.  If we crash, then the worst
+          that can happen is the children will continue to take up space, but
+          will otherwise be unreachable. */
        delete_node_single(conn, node);
 
        /* Delete children, too. */
@@ -951,9 +936,14 @@
                child = read_node(conn, 
                                  talloc_asprintf(node, "%s/%s", node->name,
                                                  node->children + i));
-               if (!child)
-                       corrupt(conn, "No child '%s' found", child);
-               delete_node(conn, child);
+               if (child) {
+                       delete_node(conn, child);
+               }
+               else {
+                       trace("delete_node: No child '%s/%s' found!\n",
+                             node->name, node->children + i);
+                       /* Skip it, we've already deleted the parent. */
+               }
        }
 }
 
@@ -977,12 +967,15 @@
                }
        }
        corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
+       return false;
 }
 
 
 static int _rm(struct connection *conn, struct node *node, const char *name)
 {
-       /* Delete from parent first, then if something explodes fsck cleans. */
+       /* Delete from parent first, then if we crash, the worst that can
+          happen is the child will continue to take up space, but will
+          otherwise be unreachable. */
        struct node *parent = read_node(conn, get_parent(name));
        if (!parent) {
                send_error(conn, EINVAL);
@@ -1001,10 +994,11 @@
 
 static void internal_rm(const char *name)
 {
-       char *tname = talloc_strdup(talloc_autofree_context(), name);
+       char *tname = talloc_strdup(NULL, name);
        struct node *node = read_node(NULL, tname);
        if (node)
                _rm(NULL, node, tname);
+       talloc_free(tname);
 }
 
 
@@ -1150,18 +1144,19 @@
        case XS_DEBUG:
                if (streq(in->buffer, "print"))
                        xprintf("debug: %s", in->buffer + get_string(in, 0));
+               if (streq(in->buffer, "check"))
+                       check_store();
 #ifdef TESTING
                /* For testing, we allow them to set id. */
                if (streq(in->buffer, "setid")) {
                        conn->id = atoi(in->buffer + get_string(in, 0));
-                       send_ack(conn, XS_DEBUG);
                } else if (streq(in->buffer, "failtest")) {
                        if (get_string(in, 0) < in->used)
                                srandom(atoi(in->buffer + get_string(in, 0)));
-                       send_ack(conn, XS_DEBUG);
                        failtest = true;
                }
 #endif /* TESTING */
+               send_ack(conn, XS_DEBUG);
                break;
 
        case XS_WATCH:
@@ -1259,7 +1254,7 @@
 
                if (in->hdr.msg.len > PATH_MAX) {
 #ifndef TESTING
-                       syslog(LOG_DAEMON, "Client tried to feed us %i",
+                       syslog(LOG_ERR, "Client tried to feed us %i",
                               in->hdr.msg.len);
 #endif
                        goto bad_client;
@@ -1430,10 +1425,16 @@
                   balloon driver will pick up stale entries.  In the case of
                   the balloon driver, this can be fatal.
                */
-               char *tlocal = talloc_strdup(talloc_autofree_context(),
-                                            "/local");
+               char *tlocal = talloc_strdup(NULL, "/local");
+
+               check_store();
+
                internal_rm("/local");
                create_node(NULL, tlocal, NULL, 0);
+
+               talloc_free(tlocal);
+
+               check_store();
        }
        else {
                tdb_ctx = tdb_open(tdbname, 7919, TDB_FLAGS, O_RDWR|O_CREAT,
@@ -1444,10 +1445,92 @@
                manual_node("/", "tool");
                manual_node("/tool", "xenstored");
                manual_node("/tool/xenstored", NULL);
-       }
-
-       /* FIXME: Fsck */
-}
+
+               check_store();
+       }
+}
+
+static char *child_name(const char *s1, const char *s2)
+{
+       if (strcmp(s1, "/")) {
+               return talloc_asprintf(NULL, "%s/%s", s1, s2);
+       }
+       else {
+               return talloc_asprintf(NULL, "/%s", s2);
+       }
+}
+
+static void check_store_(const char *name)
+{
+       struct node *node = read_node(NULL, name);
+
+       if (node) {
+               size_t i = 0;
+
+               while (i < node->childlen) {
+                       size_t childlen = strlen(node->children + i);
+                       char * childname = child_name(node->name,
+                                                     node->children + i);
+                       struct node *childnode = read_node(NULL, childname);
+                       
+                       if (childnode) {
+                               check_store_(childname);
+                               i += childlen + 1;
+                       }
+                       else {
+                               log("check_store: No child '%s' found!\n",
+                                   childname);
+
+                               memdel(node->children, i, childlen + 1,
+                                      node->childlen);
+                               node->childlen -= childlen + 1;
+                               write_node(NULL, node);
+                       }
+
+                       talloc_free(childname);
+               }
+       }
+       else {
+               /* Impossible, because no database should ever be without the
+                  root, and otherwise, we've just checked in our caller
+                  (which made a recursive call to get here). */
+                  
+               log("check_store: No child '%s' found: impossible!", name);
+       }
+}
+
+
+static void check_store()
+{
+       char * root = talloc_strdup(NULL, "/");
+       log("Checking store ...");
+       check_store_(root);
+       log("Checking store complete.");
+       talloc_free(root);
+}
+
+
+/* Something is horribly wrong: check the store. */
+static void corrupt(struct connection *conn, const char *fmt, ...)
+{
+       va_list arglist;
+       char *str;
+       int saved_errno = errno;
+
+       va_start(arglist, fmt);
+       str = talloc_vasprintf(NULL, fmt, arglist);
+       va_end(arglist);
+
+       log("corruption detected by connection %i: err %s: %s",
+           conn ? (int)conn->id : -1, strerror(saved_errno), str);
+
+#ifdef TESTING
+       /* Allow them to attach debugger. */
+       sleep(30);
+#endif
+       check_store();
+}
+
 
 static void write_pidfile(const char *pidfile)
 {
diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/xenstored_core.h
--- a/tools/xenstore/xenstored_core.h   Wed Mar  8 22:56:55 2006
+++ b/tools/xenstore/xenstored_core.h   Wed Mar  8 22:57:02 2006
@@ -148,10 +148,6 @@
 /* Replace the tdb: required for transaction code */
 bool replace_tdb(const char *newname, TDB_CONTEXT *newtdb);
 
-/* Fail due to excessive corruption, capitalist pigdogs! */
-void __attribute__((noreturn)) corrupt(struct connection *conn,
-                                      const char *fmt, ...);
-
 struct connection *new_connection(connwritefn_t *write, connreadfn_t *read);
 
 
diff -r 2142adf67d6a -r afd7d4ba113b tools/xenstore/xenstore_control.c
--- /dev/null   Wed Mar  8 22:56:55 2006
+++ b/tools/xenstore/xenstore_control.c Wed Mar  8 22:57:02 2006
@@ -0,0 +1,28 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xs.h"
+
+
+int main(int argc, char **argv)
+{
+  if (argc < 2 ||
+      strcmp(argv[1], "check"))
+  {
+    fprintf(stderr,
+            "Usage:\n"
+            "\n"
+            "       %s check\n"
+            "\n", argv[0]);
+    return 2;
+  }
+
+  struct xs_handle * xsh = xs_daemon_open();
+
+  xs_debug_command(xsh, argv[1], NULL, 0);
+
+  xs_daemon_close(xsh);
+
+  return 0;
+}

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>