
Re: [Xen-devel] [PATCH 3/4] stubdom/grub: send kernel measurements to vTPM



On 11/27/2012 10:14 AM, Daniel De Graaf wrote:
This allows a domU with an arbitrary kernel and initrd to take advantage
of the static root of trust provided by a vTPM.
Do you have any documentation updates for how to use this? They could be added to the vTPM documentation if that's the most appropriate location.

How exactly does this work? Is it intended for HVM domains whose stubdom is connected to a vTPM? I've never tried to use a vTPM with HVM guests yet, so I'd be very surprised if it just worked. Does it work for PV domains?

I'm not terribly familiar with how grub in stubdom works.

Signed-off-by: Daniel De Graaf <dgdegra@xxxxxxxxxxxxx>
---
  stubdom/grub/Makefile   |   2 +-
  stubdom/grub/kexec.c    |  62 ++++++++++++
  stubdom/grub/minios.cfg |   1 +
  stubdom/grub/sha1.c     | 260 ++++++++++++++++++++++++++++++++++++++++++++++++
  4 files changed, 324 insertions(+), 1 deletion(-)
  create mode 100644 stubdom/grub/sha1.c

diff --git a/stubdom/grub/Makefile b/stubdom/grub/Makefile
index d6e3a1e..f1b5c3e 100644
--- a/stubdom/grub/Makefile
+++ b/stubdom/grub/Makefile
@@ -59,7 +59,7 @@ NETBOOT_SOURCES:=$(addprefix netboot/,$(NETBOOT_SOURCES))

  $(BOOT): DEF_CPPFLAGS+=-D__ASSEMBLY__

-PV_GRUB_SOURCES = kexec.c mini-os.c
+PV_GRUB_SOURCES = sha1.c kexec.c mini-os.c

  SOURCES = $(NETBOOT_SOURCES) $(STAGE2_SOURCES) $(PV_GRUB_SOURCES)

diff --git a/stubdom/grub/kexec.c b/stubdom/grub/kexec.c
index b21c91a..ba48cb7 100644
--- a/stubdom/grub/kexec.c
+++ b/stubdom/grub/kexec.c
@@ -117,6 +117,66 @@ int kexec_allocate(struct xc_dom_image *dom, xen_vaddr_t up_to)
      return 0;
  }

+static inline uint16_t be16(uint16_t v)
+{
+       return (v >> 8) | (v << 8);
+}
+
+static inline uint32_t be32(uint32_t v)
+{
+       return (be16(v) << 16) | be16(v >> 16);
+}
+
+void sha_compute(uint32_t *buf, void *src, uint32_t len);
+
+#include <tpmfront.h>
+
+#define TPM_TAG_RQU_COMMAND 0xC1
+#define TPM_ORD_Extend 20
+
+struct pcr_extend_cmd {
+       uint16_t tag;
+       uint32_t size;
+       uint32_t ord;
+
+       uint32_t pcr;
+       uint32_t hash[5];
+} __attribute__((packed));
+
+static void tpm_hash2pcr(struct xc_dom_image *dom, char *cmdline)
+{
+       struct tpmfront_dev* tpm = init_tpmfront(NULL);
+       uint8_t *resp;
+       size_t resplen = 0;
+       struct pcr_extend_cmd cmd;
+
+       /* If all guests have access to a vTPM, it may be useful to replace this
+        * with ASSERT(tpm) to prevent configuration errors from allowing a guest
+        * to boot without a TPM (or with a TPM that has not been sent any
+        * measurements, which could allow forging the measurements).
+        */
+       if (!tpm)
+               return;
+
+       cmd.tag = be16(TPM_TAG_RQU_COMMAND);
+       cmd.size = be32(sizeof(cmd));
+       cmd.ord = be32(TPM_ORD_Extend);
+       cmd.pcr = be32(4); // PCR #4 for kernel
+       sha_compute(cmd.hash, dom->kernel_blob, dom->kernel_size);
+
+       tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), &resp, &resplen);
+
+       cmd.pcr = be32(5); // PCR #5 for cmdline
+       sha_compute(cmd.hash, cmdline, strlen(cmdline));
+       tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), &resp, &resplen);
+
+       cmd.pcr = be32(5); // PCR #5 for initrd
+       sha_compute(cmd.hash, dom->ramdisk_blob, dom->ramdisk_size);
+       tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), &resp, &resplen);
+
+       shutdown_tpmfront(tpm);
+}
Does this actually work? vtpm-stubdom will shut itself down when the frontend tpm disconnects. Is it really ok to connect and disconnect these devices at will?
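
On a related note, the return values of tpmfront_cmd() and the TPM's reply are ignored here, so a failed extend would go unnoticed and the PCRs would silently keep their default values. A TPM 1.2 response starts with a 10-byte header (2-byte tag, 4-byte paramSize, 4-byte returnCode, all big-endian), so a check could be as small as the sketch below. tpm_check_resp() is a hypothetical name, not part of the patch; it reuses the be32() helper above.

static int tpm_check_resp(const uint8_t *resp, size_t resplen)
{
        uint32_t size, rc;

        /* Malformed or truncated reply */
        if (resp == NULL || resplen < 10)
                return -1;
        /* memcpy() avoids unaligned reads; be32() converts the
         * big-endian wire format to host order (x86 assumed). */
        memcpy(&size, resp + 2, 4);
        memcpy(&rc, resp + 6, 4);
        if (be32(size) != resplen)
                return -1;
        return (int)be32(rc); /* 0 == TPM_SUCCESS */
}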
+
  void kexec(void *kernel, long kernel_size, void *module, long module_size, char *cmdline, unsigned long flags)
  {
      struct xc_dom_image *dom;
@@ -151,6 +211,8 @@ void kexec(void *kernel, long kernel_size, void *module, long module_size, char
      dom->console_evtchn = start_info.console.domU.evtchn;
      dom->xenstore_evtchn = start_info.store_evtchn;

+    tpm_hash2pcr(dom, cmdline);
+
      if ( (rc = xc_dom_boot_xen_init(dom, xc_handle, domid)) != 0 ) {
          grub_printf("xc_dom_boot_xen_init returned %d\n", rc);
          errnum = ERR_BOOT_FAILURE;
diff --git a/stubdom/grub/minios.cfg b/stubdom/grub/minios.cfg
index 40cfa68..8df4909 100644
--- a/stubdom/grub/minios.cfg
+++ b/stubdom/grub/minios.cfg
@@ -1,2 +1,3 @@
  CONFIG_START_NETWORK=n
  CONFIG_SPARSE_BSS=n
+CONFIG_TPMFRONT=y
diff --git a/stubdom/grub/sha1.c b/stubdom/grub/sha1.c
new file mode 100644
index 0000000..2ad2e07
--- /dev/null
+++ b/stubdom/grub/sha1.c
@@ -0,0 +1,260 @@
+/*
+ * SHA1 routine optimized to do word accesses rather than byte accesses,
+ * and to avoid unnecessary copies into the context array.
+ *
+ * This was based on the git SHA1 implementation.
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+static inline uint16_t be16(uint16_t v)
+{
+       return (v >> 8) | (v << 8);
+}
+
+static inline uint32_t be32(uint32_t v)
+{
+       return (be16(v) << 16) | be16(v >> 16);
+}
+
+static inline uint32_t get_unaligned_be32(uint32_t *v)
+{
+       return be32(*v);
+}
These exist in mini-os byteorder.h now.
+
+/*
+ * rol32 - rotate a 32-bit value left
+ *
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline uint32_t rol32(uint32_t word, unsigned int shift)
+{
+    return (word << shift) | (word >> (32 - shift));
+}
+
+/*
+ * ror32 - rotate a 32-bit value right
+ *
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+static inline uint32_t ror32(uint32_t word, unsigned int shift)
+{
+    return (word >> shift) | (word << (32 - shift));
+}
+
+
+/*
+ * If you have 32 registers or more, the compiler can (and should)
+ * try to change the array[] accesses into registers. However, on
+ * machines with less than ~25 registers, that won't really work,
+ * and at least gcc will make an unholy mess of it.
+ *
+ * So to avoid that mess which just slows things down, we force
+ * the stores to memory to actually happen (we might be better off
+ * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+ * suggested by Artur Skawina - that will also make gcc unable to
+ * try to do the silly "optimize away loads" part because it won't
+ * see what the value will be).
+ *
+ * Ben Herrenschmidt reports that on PPC, the C version comes close
+ * to the optimized asm with this (ie on PPC you don't want that
+ * 'volatile', since there are lots of registers).
+ *
+ * On ARM we get the best code generation by forcing a full memory barrier
+ * between each SHA_ROUND, otherwise gcc happily get wild with spilling and
+ * the stack frame size simply explode and performance goes down the drain.
+ */
+
+#if 1
+  #define setW(x, val) (*(volatile uint32_t *)&W(x) = (val))
+#else
+  #define setW(x, val) (W(x) = (val))
+#endif
+
+/* This "rolls" over the 512-bit array */
+#define W(x) (array[(x)&15])
+
+/*
+ * Where do we get the source from? The first 16 iterations get it from
+ * the input data, the next mix it from the 512-bit array.
+ */
+#define SHA_SRC(t) get_unaligned_be32((uint32_t *)data + t)
+#define SHA_MIX(t) rol32(W(t+13) ^ W(t+8) ^ W(t+2) ^ W(t), 1)
+
+#define SHA_ROUND(t, input, fn, constant, A, B, C, D, E) do { \
+       uint32_t TEMP = input(t); setW(t, TEMP); \
+       E += TEMP + rol32(A,5) + (fn) + (constant); \
+       B = ror32(B, 2); } while (0)
+
+#define T_0_15(t, A, B, C, D, E)  SHA_ROUND(t, SHA_SRC, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
+#define T_16_19(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (((C^D)&B)^D) , 0x5a827999, A, B, C, D, E )
+#define T_20_39(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) , 0x6ed9eba1, A, B, C, D, E )
+#define T_40_59(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, ((B&C)+(D&(B^C))) , 0x8f1bbcdc, A, B, C, D, E )
+#define T_60_79(t, A, B, C, D, E) SHA_ROUND(t, SHA_MIX, (B^C^D) ,  0xca62c1d6, A, B, C, D, E )
+
+/**
+ * sha_transform - single block SHA1 transform
+ *
+ * @digest: 160 bit digest to update
+ * @data:   512 bits of data to hash
+ * @array:  16 words of workspace (see note)
+ *
+ * This function generates a SHA1 digest for a single 512-bit block.
+ * Be warned, it does not handle padding and message digest, do not
+ * confuse it with the full FIPS 180-1 digest algorithm for variable
+ * length messages.
+ *
+ * Note: If the hash is security sensitive, the caller should be sure
+ * to clear the workspace. This is left to the caller to avoid
+ * unnecessary clears between chained hashing operations.
+ */
+void sha_transform(uint32_t *digest, const char *data, uint32_t *array)
+{
+       uint32_t A, B, C, D, E;
+
+       A = digest[0];
+       B = digest[1];
+       C = digest[2];
+       D = digest[3];
+       E = digest[4];
+
+       /* Round 1 - iterations 0-16 take their input from 'data' */
+       T_0_15( 0, A, B, C, D, E);
+       T_0_15( 1, E, A, B, C, D);
+       T_0_15( 2, D, E, A, B, C);
+       T_0_15( 3, C, D, E, A, B);
+       T_0_15( 4, B, C, D, E, A);
+       T_0_15( 5, A, B, C, D, E);
+       T_0_15( 6, E, A, B, C, D);
+       T_0_15( 7, D, E, A, B, C);
+       T_0_15( 8, C, D, E, A, B);
+       T_0_15( 9, B, C, D, E, A);
+       T_0_15(10, A, B, C, D, E);
+       T_0_15(11, E, A, B, C, D);
+       T_0_15(12, D, E, A, B, C);
+       T_0_15(13, C, D, E, A, B);
+       T_0_15(14, B, C, D, E, A);
+       T_0_15(15, A, B, C, D, E);
+
+       /* Round 1 - tail. Input from 512-bit mixing array */
+       T_16_19(16, E, A, B, C, D);
+       T_16_19(17, D, E, A, B, C);
+       T_16_19(18, C, D, E, A, B);
+       T_16_19(19, B, C, D, E, A);
+
+       /* Round 2 */
+       T_20_39(20, A, B, C, D, E);
+       T_20_39(21, E, A, B, C, D);
+       T_20_39(22, D, E, A, B, C);
+       T_20_39(23, C, D, E, A, B);
+       T_20_39(24, B, C, D, E, A);
+       T_20_39(25, A, B, C, D, E);
+       T_20_39(26, E, A, B, C, D);
+       T_20_39(27, D, E, A, B, C);
+       T_20_39(28, C, D, E, A, B);
+       T_20_39(29, B, C, D, E, A);
+       T_20_39(30, A, B, C, D, E);
+       T_20_39(31, E, A, B, C, D);
+       T_20_39(32, D, E, A, B, C);
+       T_20_39(33, C, D, E, A, B);
+       T_20_39(34, B, C, D, E, A);
+       T_20_39(35, A, B, C, D, E);
+       T_20_39(36, E, A, B, C, D);
+       T_20_39(37, D, E, A, B, C);
+       T_20_39(38, C, D, E, A, B);
+       T_20_39(39, B, C, D, E, A);
+
+       /* Round 3 */
+       T_40_59(40, A, B, C, D, E);
+       T_40_59(41, E, A, B, C, D);
+       T_40_59(42, D, E, A, B, C);
+       T_40_59(43, C, D, E, A, B);
+       T_40_59(44, B, C, D, E, A);
+       T_40_59(45, A, B, C, D, E);
+       T_40_59(46, E, A, B, C, D);
+       T_40_59(47, D, E, A, B, C);
+       T_40_59(48, C, D, E, A, B);
+       T_40_59(49, B, C, D, E, A);
+       T_40_59(50, A, B, C, D, E);
+       T_40_59(51, E, A, B, C, D);
+       T_40_59(52, D, E, A, B, C);
+       T_40_59(53, C, D, E, A, B);
+       T_40_59(54, B, C, D, E, A);
+       T_40_59(55, A, B, C, D, E);
+       T_40_59(56, E, A, B, C, D);
+       T_40_59(57, D, E, A, B, C);
+       T_40_59(58, C, D, E, A, B);
+       T_40_59(59, B, C, D, E, A);
+
+       /* Round 4 */
+       T_60_79(60, A, B, C, D, E);
+       T_60_79(61, E, A, B, C, D);
+       T_60_79(62, D, E, A, B, C);
+       T_60_79(63, C, D, E, A, B);
+       T_60_79(64, B, C, D, E, A);
+       T_60_79(65, A, B, C, D, E);
+       T_60_79(66, E, A, B, C, D);
+       T_60_79(67, D, E, A, B, C);
+       T_60_79(68, C, D, E, A, B);
+       T_60_79(69, B, C, D, E, A);
+       T_60_79(70, A, B, C, D, E);
+       T_60_79(71, E, A, B, C, D);
+       T_60_79(72, D, E, A, B, C);
+       T_60_79(73, C, D, E, A, B);
+       T_60_79(74, B, C, D, E, A);
+       T_60_79(75, A, B, C, D, E);
+       T_60_79(76, E, A, B, C, D);
+       T_60_79(77, D, E, A, B, C);
+       T_60_79(78, C, D, E, A, B);
+       T_60_79(79, B, C, D, E, A);
+
+       digest[0] += A;
+       digest[1] += B;
+       digest[2] += C;
+       digest[3] += D;
+       digest[4] += E;
+}
+
+/**
+ * sha_init - initialize the vectors for a SHA1 digest
+ * @buf: vector to initialize
+ */
+void sha_init(uint32_t *buf)
+{
+       buf[0] = 0x67452301;
+       buf[1] = 0xefcdab89;
+       buf[2] = 0x98badcfe;
+       buf[3] = 0x10325476;
+       buf[4] = 0xc3d2e1f0;
+}
+
+void sha_compute(uint32_t *buf, void *src, uint32_t len)
+{
+       uint32_t pos = 0;
+       uint8_t final[64];
+       uint32_t work[16];
+       sha_init(buf);
+       while (len >= pos + 64) {
+               sha_transform(buf, src + pos, work);
+               pos += 64;
+       }
+       memcpy(final, src + pos, len - pos);
+       // done with src; pos is now relative to final
+       pos = len - pos;
+       final[pos++] = 0x80;
+       memset(final + pos, 0, sizeof(final) - pos);
+       if (pos > 56) {
+               sha_transform(buf, (void*)final, work);
+               memset(final, 0, sizeof(final));
+       }
+       *(uint32_t*)(final + 60) = be32(len << 3);
+       sha_transform(buf, (void*)final, work);
+       buf[0] = be32(buf[0]);
+       buf[1] = be32(buf[1]);
+       buf[2] = be32(buf[2]);
+       buf[3] = be32(buf[3]);
+       buf[4] = be32(buf[4]);
+}
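
One more thought: if this open-coded SHA-1 stays, a self-test against the FIPS 180-1 "abc" vector would be cheap insurance against byte-order or padding regressions. A sketch (check_sha1() is a hypothetical name, not in the patch):

static int check_sha1(void)
{
        /* SHA1("abc") = a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d.
         * sha_compute() leaves the digest words big-endian in memory,
         * so the 20 bytes of buf compare bytewise against it. */
        static const uint8_t expect[20] = {
                0xa9, 0x99, 0x3e, 0x36, 0x47, 0x06, 0x81, 0x6a,
                0xba, 0x3e, 0x25, 0x71, 0x78, 0x50, 0xc2, 0x6c,
                0x9c, 0xd0, 0xd8, 0x9d
        };
        uint32_t buf[5];

        sha_compute(buf, "abc", 3);
        return memcmp(buf, expect, sizeof(expect)) == 0; /* 1 on match */
}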
There is also a SHA-1 routine in polarssl. You can link in just the polarssl sha1 object file without pulling in the entire library; this is what vtpm-stubdom and vtpmmgrdom do to get the crypto pieces they need. Check out their makefiles for details.
Is there any reason using polarssl would be sub-optimal?
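
For comparison, polarssl's one-shot sha1() (declared in polarssl/sha1.h, taking an input pointer, a length, and a 20-byte output buffer) would reduce each measurement to a single call, and since it writes the digest as big-endian bytes, the be32() fixup at the end of sha_compute() disappears as well. A sketch, assuming the polarssl 1.x API; measure_blob() is a hypothetical helper:

#include <polarssl/sha1.h>

/* Hash one blob straight into the extend command's digest field. */
static void measure_blob(struct pcr_extend_cmd *cmd, void *blob, size_t len)
{
        sha1(blob, len, (unsigned char *)cmd->hash);
}

tpm_hash2pcr() would then just call measure_blob(&cmd, dom->kernel_blob, dom->kernel_size) and friends, and all of stubdom/grub/sha1.c could go away.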
--
1.7.11.7



