WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH] tmem (hypervisor-side): ABI v1 to handle long object-ids

To: "Xen-Devel (xen-devel@xxxxxxxxxxxxxxxxxxx)" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH] tmem (hypervisor-side): ABI v1 to handle long object-ids (XEN-4.0-TESTING and XEN-UNSTABLE)
From: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>
Date: Fri, 3 Sep 2010 08:47:51 -0700 (PDT)
Cc: Ian Jackson <Ian.Jackson@xxxxxxxxxxxxx>, Keir Fraser <keir.fraser@xxxxxxxxxxxxx>
Delivery-date: Fri, 03 Sep 2010 08:52:33 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
[PATCH] tmem (hypervisor-side): move to new ABI version to handle 
long object-ids

Please apply patch to both xen-4.0-testing and xen-unstable
(same patch applies cleanly to both).

(Note to Keir/Ian: These patches should be applied
together, but I'm not clear on how to submit patches
that cross MAINTAINERS boundaries as this one does.)

After a great deal of discussion and review with linux
kernel developers, it appears there are "next-generation"
filesystems (such as btrfs, xfs, Lustre) that will not
be able to use tmem due to an ABI limitation... a field
that represents a unique file identifier is 64-bits in
the tmem ABI and may need to be as large as 192-bits.
So to support these guest filesystems, the tmem ABI must be
revised, from "v0" to "v1".

I *think* it is still the case that tmem is experimental
and is not used anywhere yet in production.

The tmem ABI is designed to support multiple revisions,
so the Xen tmem implementation could be updated to
handle both v0 and v1.  However this is a bit
messy and would require data structures for both v0
and v1 to appear in public Xen header files.

I am inclined to update the Xen tmem implementation
to only support v1 and gracefully fail v0.  This would
result in only a performance loss (as if tmem were
disabled) for newly launched tmem-v0-enabled guests,
but live-migration between old tmem-v0 Xen and new
tmem-v1 Xen machines would fail, and saved tmem-v0
guests will not be able to be restored on a tmem-v1
Xen machine.  I would plan to update both pre-4.0.2
and unstable (future 4.1) to only support v1.

I believe these restrictions are reasonable at this
point in the tmem lifecycle, though they may not
be reasonable in the near future; should the tmem
ABI need to be revised from v1 to v2, I understand
backwards compatibility will be required.

Signed-off-by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx>

diff -r 07ac5459b250 xen/common/tmem.c
--- a/xen/common/tmem.c Wed Aug 25 09:23:31 2010 +0100
+++ b/xen/common/tmem.c Thu Sep 02 16:43:33 2010 -0600
@@ -26,7 +26,7 @@
 #define EXPORT /* indicates code other modules are dependent upon */
 #define FORWARD
 
-#define TMEM_SPEC_VERSION 0
+#define TMEM_SPEC_VERSION 1
 
 /************  INTERFACE TO TMEM HOST-DEPENDENT (tmh) CODE ************/
 
@@ -149,14 +149,13 @@ typedef struct share_list sharelist_t;
 
 #define OBJ_HASH_BUCKETS 256 /* must be power of two */
 #define OBJ_HASH_BUCKETS_MASK (OBJ_HASH_BUCKETS-1)
-#define OBJ_HASH(_oid) (tmh_hash(_oid, BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK)
 
 struct tm_pool {
     bool_t shared;
     bool_t persistent;
     bool_t is_dying;
     int pageshift; /* 0 == 2**12 */
-    struct list_head pool_list; /* FIXME do we need this anymore? */
+    struct list_head pool_list;
     client_t *client;
     uint64_t uuid[2]; /* 0 for private, non-zero for shared */
     uint32_t pool_id;
@@ -189,9 +188,14 @@ typedef struct tm_pool pool_t;
 #define is_shared(_p)      (_p->shared)
 #define is_private(_p)     (!(_p->shared))
 
+struct oid {
+    uint64_t oid[3];
+};
+typedef struct oid OID;
+
 struct tmem_object_root {
     DECL_SENTINEL
-    uint64_t oid;
+    OID oid;
     struct rb_node rb_tree_node; /* protected by pool->pool_rwlock */
     unsigned long objnode_count; /* atomicity depends on obj_spinlock */
     long pgp_count; /* atomicity depends on obj_spinlock */
@@ -217,12 +221,14 @@ struct tmem_page_descriptor {
         struct list_head client_inv_pages;
     };
     union {
-        struct list_head client_eph_pages;
-        struct list_head pool_pers_pages;
-    };
-    union {
-        obj_t *obj;
-        uint64_t inv_oid;  /* used for invalid list only */
+        struct {
+            union {
+                struct list_head client_eph_pages;
+                struct list_head pool_pers_pages;
+            };
+            obj_t *obj;
+        } us;
+        OID inv_oid;  /* used for invalid list only */
     };
     pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
                     else compressed data (cdata) */
@@ -467,9 +473,9 @@ static NOINLINE int pcd_associate(pgp_t 
 
     if ( !tmh_dedup_enabled() )
         return 0;
-    ASSERT(pgp->obj != NULL);
-    ASSERT(pgp->obj->pool != NULL);
-    ASSERT(!pgp->obj->pool->persistent);
+    ASSERT(pgp->us.obj != NULL);
+    ASSERT(pgp->us.obj->pool != NULL);
+    ASSERT(!pgp->us.obj->pool->persistent);
     if ( cdata == NULL )
     {
         ASSERT(pgp->pfp != NULL);
@@ -528,7 +534,7 @@ static NOINLINE int pcd_associate(pgp_t 
             /* match! if not compressed, free the no-longer-needed page */
             /* but if compressed, data is assumed static so don't free! */
             if ( cdata == NULL )
-                tmem_page_free(pgp->obj->pool,pgp->pfp);
+                tmem_page_free(pgp->us.obj->pool,pgp->pfp);
             deduped_puts++;
             goto match;
         }
@@ -540,7 +546,7 @@ static NOINLINE int pcd_associate(pgp_t 
         ret = -ENOMEM;
         goto unlock;
     } else if ( cdata != NULL ) {
-        if ( (pcd->cdata = tmem_malloc_bytes(csize,pgp->obj->pool)) == NULL )
+        if ( (pcd->cdata = tmem_malloc_bytes(csize,pgp->us.obj->pool)) == NULL 
)
         {
             tmem_free(pcd,sizeof(pcd_t),NULL);
             ret = -ENOMEM;
@@ -561,11 +567,11 @@ static NOINLINE int pcd_associate(pgp_t 
         pcd->size = 0;
         pcd->tze = NULL;
     } else if ( pfp_size < PAGE_SIZE &&
-         ((pcd->tze = tmem_malloc_bytes(pfp_size,pgp->obj->pool)) != NULL) ) {
+         ((pcd->tze = tmem_malloc_bytes(pfp_size,pgp->us.obj->pool)) != NULL) 
) {
         tmh_tze_copy_from_pfp(pcd->tze,pgp->pfp,pfp_size);
         pcd->size = pfp_size;
         pcd_tot_tze_size += pfp_size;
-        tmem_page_free(pgp->obj->pool,pgp->pfp);
+        tmem_page_free(pgp->us.obj->pool,pgp->pfp);
     } else {
         pcd->pfp = pgp->pfp;
         pcd->size = PAGE_SIZE;
@@ -602,9 +608,9 @@ static NOINLINE pgp_t *pgp_alloc(obj_t *
     pool = obj->pool;
     if ( (pgp = tmem_malloc(pgp_t, pool)) == NULL )
         return NULL;
-    pgp->obj = obj;
+    pgp->us.obj = obj;
     INIT_LIST_HEAD(&pgp->global_eph_pages);
-    INIT_LIST_HEAD(&pgp->client_eph_pages);
+    INIT_LIST_HEAD(&pgp->us.client_eph_pages);
     pgp->pfp = NULL;
     if ( tmh_dedup_enabled() )
     {
@@ -642,7 +648,7 @@ static NOINLINE void pgp_free_data(pgp_t
     else if ( pgp_size )
         tmem_free(pgp->cdata,pgp_size,pool);
     else
-        tmem_page_free(pgp->obj->pool,pgp->pfp);
+        tmem_page_free(pgp->us.obj->pool,pgp->pfp);
     if ( pool != NULL && pgp_size )
     {
         pool->client->compressed_pages--;
@@ -657,18 +663,18 @@ static NOINLINE void pgp_free(pgp_t *pgp
     pool_t *pool = NULL;
 
     ASSERT_SENTINEL(pgp,PGD);
-    ASSERT(pgp->obj != NULL);
-    ASSERT_SENTINEL(pgp->obj,OBJ);
-    ASSERT_SENTINEL(pgp->obj->pool,POOL);
-    ASSERT(pgp->obj->pool->client != NULL);
+    ASSERT(pgp->us.obj != NULL);
+    ASSERT_SENTINEL(pgp->us.obj,OBJ);
+    ASSERT_SENTINEL(pgp->us.obj->pool,POOL);
+    ASSERT(pgp->us.obj->pool->client != NULL);
     if ( from_delete )
-        ASSERT(pgp_lookup_in_obj(pgp->obj,pgp->index) == NULL);
-    ASSERT(pgp->obj->pool != NULL);
-    pool = pgp->obj->pool;
+        ASSERT(pgp_lookup_in_obj(pgp->us.obj,pgp->index) == NULL);
+    ASSERT(pgp->us.obj->pool != NULL);
+    pool = pgp->us.obj->pool;
     if ( is_ephemeral(pool) )
     {
         ASSERT(list_empty(&pgp->global_eph_pages));
-        ASSERT(list_empty(&pgp->client_eph_pages));
+        ASSERT(list_empty(&pgp->us.client_eph_pages));
     }
     pgp_free_data(pgp, pool);
     atomic_dec_and_assert(global_pgp_count);
@@ -676,12 +682,12 @@ static NOINLINE void pgp_free(pgp_t *pgp
     pgp->size = -1;
     if ( is_persistent(pool) && pool->client->live_migrating )
     {
-        pgp->inv_oid = pgp->obj->oid;
+        pgp->inv_oid = pgp->us.obj->oid;
         pgp->pool_id = pool->pool_id;
         return;
     }
     INVERT_SENTINEL(pgp,PGD);
-    pgp->obj = NULL;
+    pgp->us.obj = NULL;
     pgp->index = -1;
     tmem_free(pgp,sizeof(pgp_t),pool);
 }
@@ -693,7 +699,7 @@ static NOINLINE void pgp_free_from_inv_l
     ASSERT_SENTINEL(pool,POOL);
     ASSERT_SENTINEL(pgp,PGD);
     INVERT_SENTINEL(pgp,PGD);
-    pgp->obj = NULL;
+    pgp->us.obj = NULL;
     pgp->index = -1;
     tmem_free(pgp,sizeof(pgp_t),pool);
 }
@@ -704,18 +710,18 @@ static void pgp_delist(pgp_t *pgp, bool_
     client_t *client;
 
     ASSERT(pgp != NULL);
-    ASSERT(pgp->obj != NULL);
-    ASSERT(pgp->obj->pool != NULL);
-    client = pgp->obj->pool->client;
+    ASSERT(pgp->us.obj != NULL);
+    ASSERT(pgp->us.obj->pool != NULL);
+    client = pgp->us.obj->pool->client;
     ASSERT(client != NULL);
-    if ( is_ephemeral(pgp->obj->pool) )
+    if ( is_ephemeral(pgp->us.obj->pool) )
     {
         if ( !no_eph_lock )
             tmem_spin_lock(&eph_lists_spinlock);
-        if ( !list_empty(&pgp->client_eph_pages) )
+        if ( !list_empty(&pgp->us.client_eph_pages) )
             client->eph_count--;
         ASSERT(client->eph_count >= 0);
-        list_del_init(&pgp->client_eph_pages);
+        list_del_init(&pgp->us.client_eph_pages);
         if ( !list_empty(&pgp->global_eph_pages) )
             global_eph_count--;
         ASSERT(global_eph_count >= 0);
@@ -728,12 +734,12 @@ static void pgp_delist(pgp_t *pgp, bool_
             tmem_spin_lock(&pers_lists_spinlock);
             list_add_tail(&pgp->client_inv_pages,
                           &client->persistent_invalidated_list);
-            if ( pgp != pgp->obj->pool->cur_pgp )
-                list_del_init(&pgp->pool_pers_pages);
+            if ( pgp != pgp->us.obj->pool->cur_pgp )
+                list_del_init(&pgp->us.pool_pers_pages);
             tmem_spin_unlock(&pers_lists_spinlock);
         } else {
             tmem_spin_lock(&pers_lists_spinlock);
-            list_del_init(&pgp->pool_pers_pages);
+            list_del_init(&pgp->us.pool_pers_pages);
             tmem_spin_unlock(&pers_lists_spinlock);
         }
     }
@@ -745,10 +751,10 @@ static NOINLINE void pgp_delete(pgp_t *p
     uint64_t life;
 
     ASSERT(pgp != NULL);
-    ASSERT(pgp->obj != NULL);
-    ASSERT(pgp->obj->pool != NULL);
+    ASSERT(pgp->us.obj != NULL);
+    ASSERT(pgp->us.obj->pool != NULL);
     life = get_cycles() - pgp->timestamp;
-    pgp->obj->pool->sum_life_cycles += life;
+    pgp->us.obj->pool->sum_life_cycles += life;
     pgp_delist(pgp, no_eph_lock);
     pgp_free(pgp,1);
 }
@@ -758,11 +764,11 @@ static NOINLINE void pgp_destroy(void *v
 {
     pgp_t *pgp = (pgp_t *)v;
 
-    ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
+    ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
     pgp_delist(pgp,0);
-    ASSERT(pgp->obj != NULL);
-    pgp->obj->pgp_count--;
-    ASSERT(pgp->obj->pgp_count >= 0);
+    ASSERT(pgp->us.obj != NULL);
+    pgp->us.obj->pgp_count--;
+    ASSERT(pgp->us.obj->pgp_count >= 0);
     pgp_free(pgp,0);
 }
 
@@ -849,37 +855,74 @@ static void rtn_free(rtn_t *rtn)
 
 /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
 
+int oid_compare(OID *left, OID *right) /* 3-way compare, memcmp-style: <0, 0, >0 */
+{
+    if ( left->oid[2] == right->oid[2] )
+    {
+        if ( left->oid[1] == right->oid[1] )
+        {
+            if ( left->oid[0] == right->oid[0] )
+                return 0;
+            else if ( left->oid[0] < right->oid[0] ) /* was left < left: always false */
+                return -1;
+            else
+                return 1;
+        }
+        else if ( left->oid[1] < right->oid[1] ) /* was left < left: always false */
+            return -1;
+        else
+            return 1;
+    }
+    else if ( left->oid[2] < right->oid[2] ) /* was left < left: always false */
+        return -1;
+    else
+        return 1;
+}
+
+void oid_set_invalid(OID *oidp) /* mark oid with the all-ones invalid sentinel */
+{
+    /* ~(uint64_t)0, not -1UL: on 32-bit builds -1UL is only 0xFFFFFFFF */
+    oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = ~(uint64_t)0;
+}
+
+unsigned oid_hash(OID *oidp) /* fold the 192-bit oid into an obj_rb_root bucket index */
+{
+    return (tmh_hash(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
+                     BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK);
+}
+
 /* searches for object==oid in pool, returns locked object if found */
-static NOINLINE obj_t * obj_find(pool_t *pool, uint64_t oid)
+static NOINLINE obj_t * obj_find(pool_t *pool, OID *oidp)
 {
     struct rb_node *node;
     obj_t *obj;
 
 restart_find:
     tmem_read_lock(&pool->pool_rwlock);
-    node = pool->obj_rb_root[OBJ_HASH(oid)].rb_node;
+    node = pool->obj_rb_root[oid_hash(oidp)].rb_node;
     while ( node )
     {
         obj = container_of(node, obj_t, rb_tree_node);
-        if ( obj->oid == oid )
+        switch ( oid_compare(&obj->oid, oidp) )
         {
-            if ( tmh_lock_all )
-                obj->no_evict = 1;
-            else
-            {
-                if ( !tmem_spin_trylock(&obj->obj_spinlock) )
+            case 0: /* equal */
+                if ( tmh_lock_all )
+                    obj->no_evict = 1;
+                else
                 {
+                    if ( !tmem_spin_trylock(&obj->obj_spinlock) )
+                    {
+                        tmem_read_unlock(&pool->pool_rwlock);
+                        goto restart_find;
+                    }
                     tmem_read_unlock(&pool->pool_rwlock);
-                    goto restart_find;
                 }
-                tmem_read_unlock(&pool->pool_rwlock);
-            }
-            return obj;
+                return obj;
+            case -1:
+                node = node->rb_left;
+                break;
+            case 1:
+                node = node->rb_right;
         }
-        else if ( oid < obj->oid )
-            node = node->rb_left;
-        else
-            node = node->rb_right;
     }
     tmem_read_unlock(&pool->pool_rwlock);
     return NULL;
@@ -889,7 +932,7 @@ static NOINLINE void obj_free(obj_t *obj
 static NOINLINE void obj_free(obj_t *obj, int no_rebalance)
 {
     pool_t *pool;
-    uint64_t old_oid;
+    OID old_oid;
 
     ASSERT_SPINLOCK(&obj->obj_spinlock);
     ASSERT(obj != NULL);
@@ -908,12 +951,12 @@ static NOINLINE void obj_free(obj_t *obj
     INVERT_SENTINEL(obj,OBJ);
     obj->pool = NULL;
     old_oid = obj->oid;
-    obj->oid = -1;
+    oid_set_invalid(&obj->oid);
     obj->last_client = CLI_ID_NULL;
     atomic_dec_and_assert(global_obj_count);
     /* use no_rebalance only if all objects are being destroyed anyway */
     if ( !no_rebalance )
-        rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[OBJ_HASH(old_oid)]);
+        rb_erase(&obj->rb_tree_node,&pool->obj_rb_root[oid_hash(&old_oid)]);
     tmem_free(obj,sizeof(obj_t),pool);
 }
 
@@ -927,12 +970,17 @@ static NOINLINE int obj_rb_insert(struct
     {
         this = container_of(*new, obj_t, rb_tree_node);
         parent = *new;
-        if ( obj->oid < this->oid )
-            new = &((*new)->rb_left);
-        else if ( obj->oid > this->oid )
-            new = &((*new)->rb_right);
-        else
-            return 0;
+        switch ( oid_compare(&obj->oid, &this->oid) )
+        {
+            case 0:
+                return 0;
+            case -1:
+                new = &((*new)->rb_left);
+                break;
+            case 1:
+                new = &((*new)->rb_right);
+                break;
+        }
     }
     rb_link_node(&obj->rb_tree_node, parent, new);
     rb_insert_color(&obj->rb_tree_node, root);
@@ -943,7 +991,7 @@ static NOINLINE int obj_rb_insert(struct
  * allocate, initialize, and insert an tmem_object_root
  * (should be called only if find failed)
  */
-static NOINLINE obj_t * obj_new(pool_t *pool, uint64_t oid)
+static NOINLINE obj_t * obj_new(pool_t *pool, OID *oidp)
 {
     obj_t *obj;
 
@@ -958,13 +1006,13 @@ static NOINLINE obj_t * obj_new(pool_t *
     INIT_RADIX_TREE(&obj->tree_root,0);
     spin_lock_init(&obj->obj_spinlock);
     obj->pool = pool;
-    obj->oid = oid;
+    obj->oid = *oidp;
     obj->objnode_count = 0;
     obj->pgp_count = 0;
     obj->last_client = CLI_ID_NULL;
     SET_SENTINEL(obj,OBJ);
     tmem_spin_lock(&obj->obj_spinlock);
-    obj_rb_insert(&pool->obj_rb_root[OBJ_HASH(oid)], obj);
+    obj_rb_insert(&pool->obj_rb_root[oid_hash(oidp)], obj);
     obj->no_evict = 1;
     ASSERT_SPINLOCK(&obj->obj_spinlock);
     return obj;
@@ -1256,7 +1304,7 @@ static void client_freeze(client_t *clie
 
 static bool_t tmem_try_to_evict_pgp(pgp_t *pgp, bool_t *hold_pool_rwlock)
 {
-    obj_t *obj = pgp->obj;
+    obj_t *obj = pgp->us.obj;
     pool_t *pool = obj->pool;
     client_t *client = pool->client;
     uint16_t firstbyte = pgp->firstbyte;
@@ -1280,8 +1328,8 @@ static bool_t tmem_try_to_evict_pgp(pgp_
                 pgp->eviction_attempted++;
                 list_del(&pgp->global_eph_pages);
                 
list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
-                list_del(&pgp->client_eph_pages);
-                
list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
+                list_del(&pgp->us.client_eph_pages);
+                
list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list);
                 goto pcd_unlock;
             }
         }
@@ -1314,7 +1362,7 @@ static int tmem_evict(void)
     if ( (client != NULL) && client_over_quota(client) &&
          !list_empty(&client->ephemeral_page_list) )
     {
-        
list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,client_eph_pages)
+        
list_for_each_entry_safe(pgp,pgp2,&client->ephemeral_page_list,us.client_eph_pages)
             if ( tmem_try_to_evict_pgp(pgp,&hold_pool_rwlock) )
                 goto found;
     } else if ( list_empty(&global_ephemeral_page_list) ) {
@@ -1331,7 +1379,7 @@ found:
 found:
     ASSERT(pgp != NULL);
     ASSERT_SENTINEL(pgp,PGD);
-    obj = pgp->obj;
+    obj = pgp->us.obj;
     ASSERT(obj != NULL);
     ASSERT(obj->no_evict == 0);
     ASSERT(obj->pool != NULL);
@@ -1407,16 +1455,16 @@ static NOINLINE int do_tmem_put_compress
     DECL_LOCAL_CYC_COUNTER(compress);
     
     ASSERT(pgp != NULL);
-    ASSERT(pgp->obj != NULL);
-    ASSERT_SPINLOCK(&pgp->obj->obj_spinlock);
-    ASSERT(pgp->obj->pool != NULL);
-    ASSERT(pgp->obj->pool->client != NULL);
+    ASSERT(pgp->us.obj != NULL);
+    ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
+    ASSERT(pgp->us.obj->pool != NULL);
+    ASSERT(pgp->us.obj->pool->client != NULL);
 #ifdef __i386__
     return -ENOMEM;
 #endif
 
     if ( pgp->pfp != NULL )
-        pgp_free_data(pgp, pgp->obj->pool);
+        pgp_free_data(pgp, pgp->us.obj->pool);
     START_CYC_COUNTER(compress);
     ret = tmh_compress_from_client(cmfn, &dst, &size, cva);
     if ( (ret == -EFAULT) || (ret == 0) )
@@ -1424,10 +1472,10 @@ static NOINLINE int do_tmem_put_compress
     else if ( (size == 0) || (size >= tmem_subpage_maxsize()) ) {
         ret = 0;
         goto out;
-    } else if ( tmh_dedup_enabled() && !is_persistent(pgp->obj->pool) ) {
+    } else if ( tmh_dedup_enabled() && !is_persistent(pgp->us.obj->pool) ) {
         if ( (ret = pcd_associate(pgp,dst,size)) == -ENOMEM )
             goto out;
-    } else if ( (p = tmem_malloc_bytes(size,pgp->obj->pool)) == NULL ) {
+    } else if ( (p = tmem_malloc_bytes(size,pgp->us.obj->pool)) == NULL ) {
         ret = -ENOMEM;
         goto out;
     } else {
@@ -1435,8 +1483,8 @@ static NOINLINE int do_tmem_put_compress
         pgp->cdata = p;
     }
     pgp->size = size;
-    pgp->obj->pool->client->compressed_pages++;
-    pgp->obj->pool->client->compressed_sum_size += size;
+    pgp->us.obj->pool->client->compressed_pages++;
+    pgp->us.obj->pool->client->compressed_sum_size += size;
     ret = 1;
 
 out:
@@ -1456,7 +1504,7 @@ static NOINLINE int do_tmem_dup_put(pgp_
     ASSERT(pgp != NULL);
     ASSERT(pgp->pfp != NULL);
     ASSERT(pgp->size != -1);
-    obj = pgp->obj;
+    obj = pgp->us.obj;
     ASSERT_SPINLOCK(&obj->obj_spinlock);
     ASSERT(obj != NULL);
     pool = obj->pool;
@@ -1535,7 +1583,7 @@ cleanup:
 
 
 static NOINLINE int do_tmem_put(pool_t *pool,
-              uint64_t oid, uint32_t index,
+              OID *oidp, uint32_t index,
               tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
               pagesize_t pfn_offset, pagesize_t len, void *cva)
 {
@@ -1547,7 +1595,7 @@ static NOINLINE int do_tmem_put(pool_t *
     ASSERT(pool != NULL);
     pool->puts++;
     /* does page already exist (dup)?  if so, handle specially */
-    if ( (obj = objfound = obj_find(pool,oid)) != NULL )
+    if ( (obj = objfound = obj_find(pool,oidp)) != NULL )
     {
         ASSERT_SPINLOCK(&objfound->obj_spinlock);
         if ((pgp = pgp_lookup_in_obj(objfound, index)) != NULL)
@@ -1561,7 +1609,7 @@ static NOINLINE int do_tmem_put(pool_t *
     if ( (objfound == NULL) )
     {
         tmem_write_lock(&pool->pool_rwlock);
-        if ( (obj = objnew = obj_new(pool,oid)) == NULL )
+        if ( (obj = objnew = obj_new(pool,oidp)) == NULL )
         {
             tmem_write_unlock(&pool->pool_rwlock);
             return -ENOMEM;
@@ -1627,14 +1675,14 @@ insert_page:
             &global_ephemeral_page_list);
         if (++global_eph_count > global_eph_count_max)
             global_eph_count_max = global_eph_count;
-        list_add_tail(&pgp->client_eph_pages,
+        list_add_tail(&pgp->us.client_eph_pages,
             &client->ephemeral_page_list);
         if (++client->eph_count > client->eph_count_max)
             client->eph_count_max = client->eph_count;
         tmem_spin_unlock(&eph_lists_spinlock);
     } else { /* is_persistent */
         tmem_spin_lock(&pers_lists_spinlock);
-        list_add_tail(&pgp->pool_pers_pages,
+        list_add_tail(&pgp->us.pool_pers_pages,
             &pool->persistent_page_list);
         tmem_spin_unlock(&pers_lists_spinlock);
     }
@@ -1678,7 +1726,7 @@ free:
     return ret;
 }
 
-static NOINLINE int do_tmem_get(pool_t *pool, uint64_t oid, uint32_t index,
+static NOINLINE int do_tmem_get(pool_t *pool, OID *oidp, uint32_t index,
               tmem_cli_mfn_t cmfn, pagesize_t tmem_offset,
               pagesize_t pfn_offset, pagesize_t len, void *cva)
 {
@@ -1691,7 +1739,7 @@ static NOINLINE int do_tmem_get(pool_t *
         return -EEMPTY;
 
     pool->gets++;
-    obj = obj_find(pool,oid);
+    obj = obj_find(pool,oidp);
     if ( obj == NULL )
         return 0;
 
@@ -1737,8 +1785,8 @@ static NOINLINE int do_tmem_get(pool_t *
             tmem_spin_lock(&eph_lists_spinlock);
             list_del(&pgp->global_eph_pages);
             list_add_tail(&pgp->global_eph_pages,&global_ephemeral_page_list);
-            list_del(&pgp->client_eph_pages);
-            list_add_tail(&pgp->client_eph_pages,&client->ephemeral_page_list);
+            list_del(&pgp->us.client_eph_pages);
+            
list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list);
             tmem_spin_unlock(&eph_lists_spinlock);
             ASSERT(obj != NULL);
             obj->last_client = tmh_get_cli_id_from_current();
@@ -1763,13 +1811,13 @@ bad_copy:
 
 }
 
-static NOINLINE int do_tmem_flush_page(pool_t *pool, uint64_t oid, uint32_t 
index)
+static NOINLINE int do_tmem_flush_page(pool_t *pool, OID *oidp, uint32_t index)
 {
     obj_t *obj;
     pgp_t *pgp;
 
     pool->flushs++;
-    obj = obj_find(pool,oid);
+    obj = obj_find(pool,oidp);
     if ( obj == NULL )
         goto out;
     pgp = pgp_delete_from_obj(obj, index);
@@ -1798,12 +1846,12 @@ out:
         return 1;
 }
 
-static NOINLINE int do_tmem_flush_object(pool_t *pool, uint64_t oid)
+static NOINLINE int do_tmem_flush_object(pool_t *pool, OID *oidp)
 {
     obj_t *obj;
 
     pool->flush_objs++;
-    obj = obj_find(pool,oid);
+    obj = obj_find(pool,oidp);
     if ( obj == NULL )
         goto out;
     tmem_write_lock(&pool->pool_rwlock);
@@ -1863,6 +1911,16 @@ static NOINLINE int do_tmem_new_pool(cli
     if ( pagebits != (PAGE_SHIFT - 12) )
     {
         printk("failed... unsupported pagesize %d\n",1<<(pagebits+12));
+        return -EPERM;
+    }
+    if ( flags & TMEM_POOL_PRECOMPRESSED )
+    {
+        printk("failed... precompression flag set but unsupported\n");
+        return -EPERM;
+    }
+    if ( flags & TMEM_POOL_RESERVED_BITS )
+    {
+        printk("failed... reserved bits must be zero\n");
         return -EPERM;
     }
     if ( (pool = pool_alloc()) == NULL )
@@ -2369,6 +2427,7 @@ static NOINLINE int tmemc_save_get_next_
     pool_t *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
                    ? NULL : client->pools[pool_id];
     pgp_t *pgp;
+    OID oid;
     int ret = 0;
     struct tmem_handle *h;
     unsigned int pagesize = 1 << (pool->pageshift+12);
@@ -2389,22 +2448,23 @@ static NOINLINE int tmemc_save_get_next_
     {
         /* process the first one */
         pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
-                         pgp_t,pool_pers_pages);
-    } else if ( list_is_last(&pool->cur_pgp->pool_pers_pages, 
+                         pgp_t,us.pool_pers_pages);
+    } else if ( list_is_last(&pool->cur_pgp->us.pool_pers_pages, 
                              &pool->persistent_page_list) )
     {
         /* already processed the last one in the list */
         ret = -1;
         goto out;
     }
-    pgp = list_entry((&pool->cur_pgp->pool_pers_pages)->next,
-                         pgp_t,pool_pers_pages);
+    pgp = list_entry((&pool->cur_pgp->us.pool_pers_pages)->next,
+                         pgp_t,us.pool_pers_pages);
     pool->cur_pgp = pgp;
+    oid = pgp->us.obj->oid;
     h = (struct tmem_handle *)buf.p;
-    h->oid = pgp->obj->oid;
+    *(OID *)&h->oid[0] = oid;
     h->index = pgp->index;
     buf.p = (void *)(h+1);
-    ret = do_tmem_get(pool, h->oid, h->index,0,0,0,pagesize,buf.p);
+    ret = do_tmem_get(pool, &oid, h->index,0,0,0,pagesize,buf.p);
 
 out:
     tmem_spin_unlock(&pers_lists_spinlock);
@@ -2444,7 +2504,7 @@ static NOINLINE int tmemc_save_get_next_
     }
     h = (struct tmem_handle *)buf.p;
     h->pool_id = pgp->pool_id;
-    h->oid = pgp->inv_oid;
+    *(OID *)&h->oid = pgp->inv_oid;
     h->index = pgp->index;
     ret = 1;
 out:
@@ -2452,7 +2512,7 @@ out:
     return ret;
 }
 
-static int tmemc_restore_put_page(int cli_id, int pool_id, uint64_t oid,
+static int tmemc_restore_put_page(int cli_id, int pool_id, OID *oidp,
                       uint32_t index, tmem_cli_va_t buf, uint32_t bufsize)
 {
     client_t *client = tmh_client_from_cli_id(cli_id);
@@ -2461,10 +2521,10 @@ static int tmemc_restore_put_page(int cl
 
     if ( pool == NULL )
         return -1;
-    return do_tmem_put(pool,oid,index,0,0,0,bufsize,buf.p);
+    return do_tmem_put(pool,oidp,index,0,0,0,bufsize,buf.p);
 }
 
-static int tmemc_restore_flush_page(int cli_id, int pool_id, uint64_t oid,
+static int tmemc_restore_flush_page(int cli_id, int pool_id, OID *oidp,
                         uint32_t index)
 {
     client_t *client = tmh_client_from_cli_id(cli_id);
@@ -2473,7 +2533,7 @@ static int tmemc_restore_flush_page(int 
 
     if ( pool == NULL )
         return -1;
-    return do_tmem_flush_page(pool,oid,index);
+    return do_tmem_flush_page(pool,oidp,index);
 }
 
 static NOINLINE int do_tmem_control(struct tmem_op *op)
@@ -2481,6 +2541,7 @@ static NOINLINE int do_tmem_control(stru
     int ret;
     uint32_t pool_id = op->pool_id;
     uint32_t subop = op->u.ctrl.subop;
+    OID *oidp = (OID *)(&op->u.ctrl.oid[0]);
 
     if (!tmh_current_is_privileged())
     {
@@ -2533,12 +2594,12 @@ static NOINLINE int do_tmem_control(stru
         break;
     case TMEMC_RESTORE_PUT_PAGE:
         ret = tmemc_restore_put_page(op->u.ctrl.cli_id,pool_id,
-                                     op->u.ctrl.arg3, op->u.ctrl.arg2,
+                                     oidp, op->u.ctrl.arg2,
                                      op->u.ctrl.buf, op->u.ctrl.arg1);
         break;
     case TMEMC_RESTORE_FLUSH_PAGE:
         ret = tmemc_restore_flush_page(op->u.ctrl.cli_id,pool_id,
-                                       op->u.ctrl.arg3, op->u.ctrl.arg2);
+                                       oidp, op->u.ctrl.arg2);
         break;
     default:
         ret = -1;
@@ -2553,6 +2614,7 @@ EXPORT long do_tmem_op(tmem_cli_op_t uop
     struct tmem_op op;
     client_t *client = tmh_client_from_current();
     pool_t *pool = NULL;
+    OID *oidp;
     int rc = 0;
     bool_t succ_get = 0, succ_put = 0;
     bool_t non_succ_get = 0, non_succ_put = 0;
@@ -2656,6 +2718,7 @@ EXPORT long do_tmem_op(tmem_cli_op_t uop
         ASSERT_SENTINEL(pool,POOL);
     }
 
+    oidp = (OID *)&op.u.gen.oid[0];
     switch ( op.cmd )
     {
     case TMEM_NEW_POOL:
@@ -2664,28 +2727,28 @@ EXPORT long do_tmem_op(tmem_cli_op_t uop
         break;
     case TMEM_NEW_PAGE:
         tmem_ensure_avail_pages();
-        rc = do_tmem_put(pool, op.u.gen.object,
+        rc = do_tmem_put(pool, oidp,
                          op.u.gen.index, op.u.gen.cmfn, 0, 0, 0, NULL);
         break;
     case TMEM_PUT_PAGE:
         tmem_ensure_avail_pages();
-        rc = do_tmem_put(pool, op.u.gen.object,
+        rc = do_tmem_put(pool, oidp,
                     op.u.gen.index, op.u.gen.cmfn, 0, 0, PAGE_SIZE, NULL);
         if (rc == 1) succ_put = 1;
         else non_succ_put = 1;
         break;
     case TMEM_GET_PAGE:
-        rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
+        rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
                          0, 0, PAGE_SIZE, 0);
         if (rc == 1) succ_get = 1;
         else non_succ_get = 1;
         break;
     case TMEM_FLUSH_PAGE:
         flush = 1;
-        rc = do_tmem_flush_page(pool, op.u.gen.object, op.u.gen.index);
+        rc = do_tmem_flush_page(pool, oidp, op.u.gen.index);
         break;
     case TMEM_FLUSH_OBJECT:
-        rc = do_tmem_flush_object(pool, op.u.gen.object);
+        rc = do_tmem_flush_object(pool, oidp);
         flush_obj = 1;
         break;
     case TMEM_DESTROY_POOL:
@@ -2693,12 +2756,12 @@ EXPORT long do_tmem_op(tmem_cli_op_t uop
         rc = do_tmem_destroy_pool(op.pool_id);
         break;
     case TMEM_READ:
-        rc = do_tmem_get(pool, op.u.gen.object, op.u.gen.index, op.u.gen.cmfn,
+        rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
                          op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                          op.u.gen.len,0);
         break;
     case TMEM_WRITE:
-        rc = do_tmem_put(pool, op.u.gen.object,
+        rc = do_tmem_put(pool, oidp,
                          op.u.gen.index, op.u.gen.cmfn,
                          op.u.gen.tmem_offset, op.u.gen.pfn_offset,
                          op.u.gen.len, NULL);
diff -r 07ac5459b250 xen/include/public/tmem.h
--- a/xen/include/public/tmem.h Wed Aug 25 09:23:31 2010 +0100
+++ b/xen/include/public/tmem.h Thu Sep 02 16:43:33 2010 -0600
@@ -28,6 +28,9 @@
 #define __XEN_PUBLIC_TMEM_H__
 
 #include "xen.h"
+
+/* version of ABI */
+#define TMEM_SPEC_VERSION          1
 
 /* Commands to HYPERVISOR_tmem_op() */
 #define TMEM_CONTROL               0
@@ -75,10 +78,12 @@
 /* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
 #define TMEM_POOL_PERSIST          1
 #define TMEM_POOL_SHARED           2
+#define TMEM_POOL_PRECOMPRESSED    4
 #define TMEM_POOL_PAGESIZE_SHIFT   4
 #define TMEM_POOL_PAGESIZE_MASK  0xf
 #define TMEM_POOL_VERSION_SHIFT   24
 #define TMEM_POOL_VERSION_MASK  0xff
+#define TMEM_POOL_RESERVED_BITS  0x00ffff00
 
 /* Bits for client flags (save/restore) */
 #define TMEM_CLIENT_COMPRESS       1
@@ -106,12 +111,12 @@ struct tmem_op {
             uint32_t cli_id;
             uint32_t arg1;
             uint32_t arg2;
-            uint64_t arg3;
+            uint64_t oid[3];
             tmem_cli_va_t buf;
         } ctrl; /* for cmd == TMEM_CONTROL */
         struct {
             
-            uint64_t object;
+            uint64_t oid[3];
             uint32_t index;
             uint32_t tmem_offset;
             uint32_t pfn_offset;
@@ -126,9 +131,8 @@ struct tmem_handle {
 struct tmem_handle {
     uint32_t pool_id;
     uint32_t index;
-    uint64_t oid;
+    uint64_t oid[3];
 };
-
 #endif
 
 #endif /* __XEN_PUBLIC_TMEM_H__ */

Attachment: tmem-abi-v1-hyp.patch
Description: Binary data

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>