xen-ia64-devel

Re: [Xen-ia64-devel] [PATCH][RFC] per vcpu VHPT

To: xen-ia64-devel@xxxxxxxxxxxxxxxxxxx
Subject: Re: [Xen-ia64-devel] [PATCH][RFC] per vcpu VHPT
From: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
Date: Mon, 24 Jul 2006 23:22:08 +0900
Delivery-date: Mon, 24 Jul 2006 07:29:47 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
In-reply-to: <20060724125428.GD31855%yamahata@xxxxxxxxxxxxx>
List-help: <mailto:xen-ia64-devel-request@lists.xensource.com?subject=help>
List-id: Discussion of the ia64 port of Xen <xen-ia64-devel.lists.xensource.com>
List-post: <mailto:xen-ia64-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-ia64-devel>, <mailto:xen-ia64-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-ia64-devel>, <mailto:xen-ia64-devel-request@lists.xensource.com?subject=unsubscribe>
References: <20060724125428.GD31855%yamahata@xxxxxxxxxxxxx>
Sender: xen-ia64-devel-bounces@xxxxxxxxxxxxxxxxxxx
User-agent: Mutt/1.4.2.1i
I sent out the old patches; sorry about that.
The newest version is attached. Please discard the old ones.

On Mon, Jul 24, 2006 at 09:54:28PM +0900, Isaku Yamahata wrote:
> Hi.
> 
> I have implemented a per-vcpu VHPT for non-VTi domains.
> The motivation is to reduce the cost of migrating vcpus between physical
> cpus under the credit scheduler.
> If more than one vcpu of the same domain runs on a physical cpu, the VHPT
> has to be flushed on every vcpu switch. I'd like to avoid that.
> The patch is for discussion and performance evaluation, not for commit.
> 
> 
> I checked the mailing list archives and found the thread
> Xen/ia64 - global or per VP VHPT
> http://lists.xensource.com/archives/html/xen-devel/2005-04/msg01002.html
> 
> As far as I can tell, that discussion never reached a conclusion.
> (At least that is my understanding; the thread was very long to follow,
> so I might be wrong. Please correct me.)
> With this patch we can measure the performance and decide whether to
> include it or discard the idea.
> 
> 
> This patch introduces a compile-time option, xen_ia64_pervcpu_vhpt=y,
> to enable this feature, and a Xen boot-time option, pervcpu_vhpt=0,
> to disable per-vcpu VHPT allocation.
> The patch depends on the tlb tracking patch which I sent before.
> I have attached these patches for convenience.
> 
> Thanks
> -- 
> yamahata

> # HG changeset patch
> # User yamahata@xxxxxxxxxxxxx
> # Node ID c654d462c4481685fb2e803e41cb2beba56bee4b
> # Parent  b2abc70be89e02d0d380674096c8c1fb9e552431
> import linux/include/linux/hash.h.
> PATCHNAME: import_linux_hash.h
> 
> Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
> 
> diff -r b2abc70be89e -r c654d462c448 xen/include/asm-ia64/linux/README.origin
> --- a/xen/include/asm-ia64/linux/README.origin        Wed Jul 19 07:17:54 2006 -0600
> +++ b/xen/include/asm-ia64/linux/README.origin        Mon Jul 24 21:34:37 2006 +0900
> @@ -8,6 +8,7 @@ bitmap.h              -> linux/include/linux/bitmap.
>  bitmap.h             -> linux/include/linux/bitmap.h
>  bitops.h             -> linux/include/linux/bitops.h
>  initrd.h             -> linux/include/linux/initrd.h
> +hash.h                       -> linux/include/linux/hash.h
>  jiffies.h            -> linux/include/linux/jiffies.h
>  kmalloc_sizes.h              -> linux/include/linux/kmalloc_sizes.h
>  linkage.h            -> linux/include/linux/linkage.h
> diff -r b2abc70be89e -r c654d462c448 xen/include/asm-ia64/linux/hash.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-ia64/linux/hash.h       Mon Jul 24 21:34:37 2006 +0900
> @@ -0,0 +1,58 @@
> +#ifndef _LINUX_HASH_H
> +#define _LINUX_HASH_H
> +/* Fast hashing routine for a long.
> +   (C) 2002 William Lee Irwin III, IBM */
> +
> +/*
> + * Knuth recommends primes in approximately golden ratio to the maximum
> + * integer representable by a machine word for multiplicative hashing.
> + * Chuck Lever verified the effectiveness of this technique:
> + * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
> + *
> + * These primes are chosen to be bit-sparse, that is operations on
> + * them can use shifts and additions instead of multiplications for
> + * machines where multiplications are slow.
> + */
> +#if BITS_PER_LONG == 32
> +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
> +#define GOLDEN_RATIO_PRIME 0x9e370001UL
> +#elif BITS_PER_LONG == 64
> +/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
> +#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
> +#else
> +#error Define GOLDEN_RATIO_PRIME for your wordsize.
> +#endif
> +
> +static inline unsigned long hash_long(unsigned long val, unsigned int bits)
> +{
> +     unsigned long hash = val;
> +
> +#if BITS_PER_LONG == 64
> +     /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
> +     unsigned long n = hash;
> +     n <<= 18;
> +     hash -= n;
> +     n <<= 33;
> +     hash -= n;
> +     n <<= 3;
> +     hash += n;
> +     n <<= 3;
> +     hash -= n;
> +     n <<= 4;
> +     hash += n;
> +     n <<= 2;
> +     hash += n;
> +#else
> +     /* On some cpus multiply is faster, on others gcc will do shifts */
> +     hash *= GOLDEN_RATIO_PRIME;
> +#endif
> +
> +     /* High bits are more random, so use them. */
> +     return hash >> (BITS_PER_LONG - bits);
> +}
> +     
> +static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
> +{
> +     return hash_long((unsigned long)ptr, bits);
> +}
> +#endif /* _LINUX_HASH_H */
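
For context: linux/hash.h is pulled in so that the tlb tracking patch below can
hash p2m entry pointers into buckets. A minimal sketch of how hash_long() is
typically used for that, with illustrative names that are not part of the patch:

    #include <linux/hash.h>

    #define EXAMPLE_HASH_SHIFT 10   /* 1024 buckets; size chosen only for illustration */
    static struct list_head example_buckets[1 << EXAMPLE_HASH_SHIFT];

    /* hash_long() returns a value in [0, 2^bits), so the result can be
     * used directly as a bucket index. */
    static struct list_head*
    example_bucket(volatile pte_t* ptep)
    {
        return &example_buckets[hash_long((unsigned long)ptep,
                                          EXAMPLE_HASH_SHIFT)];
    }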

> # HG changeset patch
> # User yamahata@xxxxxxxxxxxxx
> # Node ID cb0aa2b2e180d76d09592ed32338f9cb4ac5b7a0
> # Parent  c654d462c4481685fb2e803e41cb2beba56bee4b
> add tlb insert tracking so that the vTLB can be flushed over a finer-grained
> virtual address range when a page is unmapped from a domain.
> This functionality is enabled with a compile time option,
> xen_ia64_tlb_track=y.
> PATCHNAME: tlb_track
> 
> Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
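
In rough terms, the unmap path asks the tracker what, if anything, was inserted
for the p2m entry and only falls back to a full vTLB flush when it cannot tell.
A simplified sketch of that decision, based on the domain_page_flush() change in
this patch (locking and the TLB_TRACK_AGAIN case are omitted, so this is not a
drop-in copy):

    static void
    flush_after_unmap(struct domain* d, volatile pte_t* ptep, pte_t old_pte)
    {
        struct tlb_track_entry* entry;

        switch (tlb_track_search_and_remove(d->arch.tlb_track,
                                            ptep, old_pte, &entry)) {
        case TLB_TRACK_NOT_FOUND:
            /* tracked, but never inserted into the TLB: nothing to flush */
            break;
        case TLB_TRACK_FOUND:
            /* exactly one insertion recorded: flush only that entry */
            domain_flush_vltb_track_entry(d, entry);
            tlb_track_free_entry(d->arch.tlb_track, entry);
            break;
        case TLB_TRACK_NOT_TRACKED:
        case TLB_TRACK_MANY:
        default:
            /* no information, or too many insertions: full flush as before */
            domain_flush_vtlb_all();
            break;
        }
    }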
> 
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/Rules.mk
> --- a/xen/arch/ia64/Rules.mk  Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/Rules.mk  Mon Jul 24 21:35:16 2006 +0900
> @@ -39,6 +39,9 @@ ifeq ($(xen_ia64_dom0_virtual_physical),
>  ifeq ($(xen_ia64_dom0_virtual_physical),y)
>  CFLAGS       += -DCONFIG_XEN_IA64_DOM0_VP
>  endif
> +ifeq ($(xen_ia64_tlb_track),y)
> +CFLAGS       += -DCONFIG_XEN_IA64_TLB_TRACK
> +endif
>  ifeq ($(no_warns),y)
>  CFLAGS       += -Wa,--fatal-warnings -Werror -Wno-uninitialized
>  endif
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/Makefile
> --- a/xen/arch/ia64/xen/Makefile      Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/Makefile      Mon Jul 24 21:35:16 2006 +0900
> @@ -27,3 +27,4 @@ obj-y += privop_stat.o
>  obj-y += privop_stat.o
>  
>  obj-$(crash_debug) += gdbstub.o
> +obj-$(xen_ia64_tlb_track) += tlb_track.o
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/domain.c
> --- a/xen/arch/ia64/xen/domain.c      Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/domain.c      Mon Jul 24 21:35:16 2006 +0900
> @@ -60,6 +60,9 @@
>  #include <asm/regionreg.h>
>  #include <asm/dom_fw.h>
>  #include <asm/privop_stat.h>
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +#include <asm/tlb_track.h>
> +#endif
>  
>  #ifndef CONFIG_XEN_IA64_DOM0_VP
>  #define CONFIG_DOMAIN0_CONTIGUOUS
> @@ -351,6 +354,10 @@ int arch_domain_create(struct domain *d)
>       if (is_idle_domain(d))
>           return 0;
>  
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +     if (tlb_track_create(d) < 0)
> +       goto fail_nomem;
> +#endif
>       d->shared_info = alloc_xenheap_pages(get_order_from_shift(XSI_SHIFT));
>       if (d->shared_info == NULL)
>           goto fail_nomem;
> @@ -389,6 +396,9 @@ void arch_domain_destroy(struct domain *
>       if (d->shared_info != NULL)
>           free_xenheap_pages(d->shared_info, get_order_from_shift(XSI_SHIFT));
>  
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +     tlb_track_destroy(d);
> +#endif
>       domain_flush_destroy (d);
>  
>       deallocate_rid_range(d);
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/faults.c
> --- a/xen/arch/ia64/xen/faults.c      Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/faults.c      Mon Jul 24 21:35:16 2006 +0900
> @@ -27,6 +27,7 @@
>  #include <asm/debugger.h>
>  #include <asm/fpswa.h>
>  #include <asm/bundle.h>
> +#include <asm/p2m_entry.h>
>  #include <asm/privop_stat.h>
>  #include <asm/asm-xsi-offsets.h>
>  
> @@ -202,8 +203,15 @@ void ia64_do_page_fault (unsigned long a
>       fault = vcpu_translate(current,address,is_data,&pteval,&itir,&iha);
>       if (fault == IA64_NO_FAULT || fault == IA64_USE_TLB) {
>               struct p2m_entry entry;
> -             pteval = translate_domain_pte(pteval, address, itir, &logps, &entry);
> -             vcpu_itc_no_srlz(current,is_data?2:1,address,pteval,-1UL,logps);
> +             unsigned long m_pteval;
> +             m_pteval = translate_domain_pte(pteval, address, itir, &logps, &entry);
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
> +             vcpu_itc_no_srlz(current, (is_data? 2: 1) | 4, 
> +                              address, m_pteval, pteval, logps);
> +#else
> +             vcpu_itc_no_srlz(current, (is_data? 2: 1) | 4, 
> +                              address, m_pteval, pteval, logps, &entry);
> +#endif
>               if ((fault == IA64_USE_TLB && !current->arch.dtlb.pte.p) ||
>                   p2m_entry_retry(&entry)) {
>                       /* dtlb has been purged in-between.  This dtlb was
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/mm.c
> --- a/xen/arch/ia64/xen/mm.c  Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/mm.c  Mon Jul 24 21:35:16 2006 +0900
> @@ -170,13 +170,14 @@
>  #include <asm/pgalloc.h>
>  #include <asm/vhpt.h>
>  #include <asm/vcpu.h>
> +#include <asm/p2m_entry.h>
>  #include <linux/efi.h>
>  
>  #ifndef CONFIG_XEN_IA64_DOM0_VP
>  #define CONFIG_DOMAIN0_CONTIGUOUS
>  #else
> -static void domain_page_flush(struct domain* d, unsigned long mpaddr,
> -                              unsigned long old_mfn, unsigned long new_mfn);
> +static void domain_page_flush(struct domain* d,
> +                              volatile pte_t* ptep, pte_t old_pte);
>  #endif
>  
>  static struct domain *dom_xen, *dom_io;
> @@ -718,6 +719,19 @@ void *domain_mpa_to_imva(struct domain *
>  }
>  #endif
>  
> +static unsigned long
> +assign_flags_to_pteflags(unsigned long flags)
> +{
> +    unsigned long pteflags =
> +        (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +    if (flags & ASSIGN_tlb_track) {
> +        pteflags |= _PAGE_TLB_TRACKING;
> +    }
> +#endif
> +    return pteflags;
> +}
> +
>  /* Allocate a new page for domain and map it to the specified metaphysical
>     address.  */
>  static struct page_info *
> @@ -811,7 +825,7 @@ assign_new_domain0_page(struct domain *d
>  }
>  
>  /* map a physical address to the specified metaphysical addr */
> -// flags: currently only ASSIGN_readonly
> +// flags: ASSIGN_xxx 
>  // This is called by assign_domain_mmio_page().
>  // So accessing to pte is racy.
>  void
> @@ -823,13 +837,13 @@ __assign_domain_page(struct domain *d,
>      pte_t old_pte;
>      pte_t new_pte;
>      pte_t ret_pte;
> -    unsigned long arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> +    unsigned long pteflags = assign_flags_to_pteflags(flags);
>  
>      pte = lookup_alloc_domain_pte(d, mpaddr);
>  
>      old_pte = __pte(0);
>      new_pte = pfn_pte(physaddr >> PAGE_SHIFT,
> -                      __pgprot(__DIRTY_BITS | _PAGE_PL_2 | arflags));
> +                      __pgprot(__DIRTY_BITS | _PAGE_PL_2 | pteflags));
>      ret_pte = ptep_cmpxchg_rel(&d->arch.mm, mpaddr, pte, old_pte, new_pte);
>      if (pte_val(ret_pte) == pte_val(old_pte))
>          smp_mb();
> @@ -945,7 +959,7 @@ assign_domain_mach_page(struct domain *d
>  // caller must call set_gpfn_from_mfn() before call if necessary.
>  // because set_gpfn_from_mfn() result must be visible before pte xchg
>  // caller must use memory barrier. NOTE: xchg has acquire semantics.
> -// flags: currently only ASSIGN_readonly
> +// flags: ASSIGN_xxx
>  static void
>  assign_domain_page_replace(struct domain *d, unsigned long mpaddr,
>                             unsigned long mfn, unsigned long flags)
> @@ -954,11 +968,11 @@ assign_domain_page_replace(struct domain
>      volatile pte_t* pte;
>      pte_t old_pte;
>      pte_t npte;
> -    unsigned long arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> +    unsigned long pteflags = assign_flags_to_pteflags(flags);
>      pte = lookup_alloc_domain_pte(d, mpaddr);
>  
>      // update pte
> -    npte = pfn_pte(mfn, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | arflags));
> +    npte = pfn_pte(mfn, __pgprot(__DIRTY_BITS | _PAGE_PL_2 | pteflags));
>      old_pte = ptep_xchg(mm, mpaddr, pte, npte);
>      if (pte_mem(old_pte)) {
>          unsigned long old_mfn = pte_pfn(old_pte);
> @@ -978,7 +992,7 @@ assign_domain_page_replace(struct domain
>                  set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
>              }
>  
> -            domain_page_flush(d, mpaddr, old_mfn, mfn);
> +            domain_page_flush(d, pte, old_pte);
>  
>              try_to_clear_PGC_allocate(d, old_page);
>              put_page(old_page);
> @@ -997,29 +1011,29 @@ assign_domain_page_cmpxchg_rel(struct do
>      struct mm_struct *mm = &d->arch.mm;
>      volatile pte_t* pte;
>      unsigned long old_mfn;
> -    unsigned long old_arflags;
> +    unsigned long old_pteflags;
>      pte_t old_pte;
>      unsigned long new_mfn;
> -    unsigned long new_arflags;
> +    unsigned long new_pteflags;
>      pte_t new_pte;
>      pte_t ret_pte;
>  
>      pte = lookup_alloc_domain_pte(d, mpaddr);
>  
>   again:
> -    old_arflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
> +    old_pteflags = pte_val(*pte) & ~_PAGE_PPN_MASK;
>      old_mfn = page_to_mfn(old_page);
> -    old_pte = pfn_pte(old_mfn, __pgprot(old_arflags));
> +    old_pte = pfn_pte(old_mfn, __pgprot(old_pteflags));
>      if (!pte_present(old_pte)) {
> -        DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx\n",
> -                __func__, pte_val(old_pte), old_arflags, old_mfn);
> +        DPRINTK("%s: old_pte 0x%lx old_pteflags 0x%lx old_mfn 0x%lx\n",
> +                __func__, pte_val(old_pte), old_pteflags, old_mfn);
>          return -EINVAL;
>      }
>  
> -    new_arflags = (flags & ASSIGN_readonly)? _PAGE_AR_R: _PAGE_AR_RWX;
> +    new_pteflags = assign_flags_to_pteflags(flags);
>      new_mfn = page_to_mfn(new_page);
>      new_pte = pfn_pte(new_mfn,
> -                      __pgprot(__DIRTY_BITS | _PAGE_PL_2 | new_arflags));
> +                      __pgprot(__DIRTY_BITS | _PAGE_PL_2 | new_pteflags));
>  
>      // update pte
>      ret_pte = ptep_cmpxchg_rel(mm, mpaddr, pte, old_pte, new_pte);
> @@ -1028,10 +1042,10 @@ assign_domain_page_cmpxchg_rel(struct do
>              goto again;
>          }
>  
> -        DPRINTK("%s: old_pte 0x%lx old_arflags 0x%lx old_mfn 0x%lx "
> +        DPRINTK("%s: old_pte 0x%lx old_pteflags 0x%lx old_mfn 0x%lx "
>                  "ret_pte 0x%lx ret_mfn 0x%lx\n",
>                  __func__,
> -                pte_val(old_pte), old_arflags, old_mfn,
> +                pte_val(old_pte), old_pteflags, old_mfn,
>                  pte_val(ret_pte), pte_pfn(ret_pte));
>          return -EINVAL;
>      }
> @@ -1043,7 +1057,7 @@ assign_domain_page_cmpxchg_rel(struct do
>  
>      set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);
>  
> -    domain_page_flush(d, mpaddr, old_mfn, new_mfn);
> +    domain_page_flush(d, pte, old_pte);
>      put_page(old_page);
>      return 0;
>  }
> @@ -1111,7 +1125,7 @@ zap_domain_page_one(struct domain *d, un
>          set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
>      }
>  
> -    domain_page_flush(d, mpaddr, mfn, INVALID_MFN);
> +    domain_page_flush(d, pte, old_pte);
>  
>      if (page_get_owner(page) != NULL) {
>          try_to_clear_PGC_allocate(d, page);
> @@ -1199,8 +1213,12 @@ create_grant_host_mapping(unsigned long 
>      BUG_ON(ret == 0);
>      BUG_ON(page_get_owner(mfn_to_page(mfn)) == d &&
>             get_gpfn_from_mfn(mfn) != INVALID_M2P_ENTRY);
> -    assign_domain_page_replace(d, gpaddr, mfn, (flags & GNTMAP_readonly)?
> -                                               ASSIGN_readonly: ASSIGN_writable);
> +    assign_domain_page_replace(d, gpaddr, mfn,
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +                               ASSIGN_tlb_track |
> +#endif
> +                               ((flags & GNTMAP_readonly) ?
> +                                ASSIGN_readonly: ASSIGN_writable));
>      return GNTST_okay;
>  }
>  
> @@ -1254,7 +1272,7 @@ destroy_grant_host_mapping(unsigned long
>      }
>      BUG_ON(pte_pfn(old_pte) != mfn);
>  
> -    domain_page_flush(d, gpaddr, mfn, INVALID_MFN);
> +    domain_page_flush(d, pte, old_pte);
>  
>      page = mfn_to_page(mfn);
> +    BUG_ON(page_get_owner(page) == d);//try_to_clear_PGC_allocate(d, page) is not needed.
> @@ -1418,11 +1436,38 @@ guest_physmap_remove_page(struct domain 
>  
>  //XXX sledgehammer.
>  //    flush finer range.
> -void
> -domain_page_flush(struct domain* d, unsigned long mpaddr,
> -                  unsigned long old_mfn, unsigned long new_mfn)
> -{
> +static void
> +domain_page_flush(struct domain* d, volatile pte_t* ptep, pte_t old_pte)
> +{
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>      domain_flush_vtlb_all();
> +#else
> +    struct tlb_track_entry* entry;
> +    switch (tlb_track_search_and_remove(d->arch.tlb_track,
> +                                        ptep, old_pte, &entry)) {
> +    case TLB_TRACK_NOT_TRACKED:
> +        //DPRINTK("%s TLB_TRACK_NOT_TRACKED\n", __func__);
> +        domain_flush_vtlb_all();
> +        break;
> +    case TLB_TRACK_NOT_FOUND:
> +        // do nothing
> +        //DPRINTK("%s TLB_TRACK_NOT_FOUND\n", __func__);
> +        break;
> +    case TLB_TRACK_FOUND:
> +        //DPRINTK("%s TLB_TRACK_FOUND\n", __func__);
> +        domain_flush_vltb_track_entry(d, entry);
> +        tlb_track_free_entry(d->arch.tlb_track, entry);
> +        break;
> +    case TLB_TRACK_MANY:
> +        DPRINTK("%s TLB_TRACK_MANY\n", __func__);
> +        domain_flush_vtlb_all();
> +        break;
> +    case TLB_TRACK_AGAIN:
> +        DPRINTK("%s TLB_TRACK_AGAIN\n", __func__);
> +        BUG();
> +        break;
> +    }
> +#endif
>  }
>  
>  int
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/vcpu.c
> --- a/xen/arch/ia64/xen/vcpu.c        Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/vcpu.c        Mon Jul 24 21:35:16 2006 +0900
> @@ -22,6 +22,7 @@
>  #include <asm/vmx_phy_mode.h>
>  #include <asm/bundle.h>
>  #include <asm/privop_stat.h>
> +#include <asm/p2m_entry.h>
>  
>  /* FIXME: where these declarations should be there ? */
>  extern void getreg(unsigned long regnum, unsigned long *val, int *nat, struct pt_regs *regs);
> @@ -2003,7 +2004,11 @@ IA64FAULT vcpu_set_dtr(VCPU *vcpu, u64 s
>   VCPU translation cache access routines
>  **************************************************************************/
>  
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>  void vcpu_itc_no_srlz(VCPU *vcpu, UINT64 IorD, UINT64 vaddr, UINT64 pte, UINT64 mp_pte, UINT64 logps)
> +#else
> +void vcpu_itc_no_srlz(VCPU *vcpu, UINT64 IorD, UINT64 vaddr, UINT64 pte, UINT64 mp_pte, UINT64 logps, struct p2m_entry* entry)
> +#endif
>  {
>       unsigned long psr;
>       unsigned long ps = (vcpu->domain==dom0) ? logps : PAGE_SHIFT;
> @@ -2017,6 +2022,9 @@ void vcpu_itc_no_srlz(VCPU *vcpu, UINT64
>  
>  #ifdef CONFIG_XEN_IA64_DOM0_VP
>       BUG_ON(logps > PAGE_SHIFT);
> +#endif
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +     vcpu_tlb_track_insert_or_dirty(vcpu, vaddr, entry);
>  #endif
>       psr = ia64_clear_ic();
>       ia64_itc(IorD,vaddr,pte,ps); // FIXME: look for bigger mappings
> @@ -2035,7 +2043,7 @@ void vcpu_itc_no_srlz(VCPU *vcpu, UINT64
>       // PAGE_SIZE mapping in the vhpt for now, else purging is complicated
>       else vhpt_insert(vaddr,pte,PAGE_SHIFT<<2);
>  #endif
> -     if ((mp_pte == -1UL) || (IorD & 0x4)) // don't place in 1-entry TLB
> +     if (IorD & 0x4) // don't place in 1-entry TLB
>               return;
>       if (IorD & 0x1) {
>               vcpu_set_tr_entry(&PSCBX(vcpu,itlb),mp_pte,ps<<2,vaddr);
> @@ -2060,7 +2068,11 @@ again:
>       pteval = translate_domain_pte(pte, ifa, itir, &logps, &entry);
>       if (!pteval) return IA64_ILLOP_FAULT;
>       if (swap_rr0) set_one_rr(0x0,PSCB(vcpu,rrs[0]));
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>       vcpu_itc_no_srlz(vcpu,2,ifa,pteval,pte,logps);
> +#else
> +     vcpu_itc_no_srlz(vcpu,2,ifa,pteval,pte,logps,&entry);
> +#endif
>       if (swap_rr0) set_metaphysical_rr0();
>       if (p2m_entry_retry(&entry)) {
>               vcpu_flush_tlb_vhpt_range(ifa, logps);
> @@ -2083,7 +2095,11 @@ again:
>       pteval = translate_domain_pte(pte, ifa, itir, &logps, &entry);
>       if (!pteval) return IA64_ILLOP_FAULT;
>       if (swap_rr0) set_one_rr(0x0,PSCB(vcpu,rrs[0]));
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>       vcpu_itc_no_srlz(vcpu, 1,ifa,pteval,pte,logps);
> +#else
> +     vcpu_itc_no_srlz(vcpu, 1,ifa,pteval,pte,logps,&entry);
> +#endif
>       if (swap_rr0) set_metaphysical_rr0();
>       if (p2m_entry_retry(&entry)) {
>               vcpu_flush_tlb_vhpt_range(ifa, logps);
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/vhpt.c
> --- a/xen/arch/ia64/xen/vhpt.c        Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/arch/ia64/xen/vhpt.c        Mon Jul 24 21:35:16 2006 +0900
> @@ -227,6 +227,48 @@ void domain_flush_vtlb_range (struct dom
>       ia64_global_tlb_purge(vadr,vadr+addr_range,PAGE_SHIFT);
>  }
>  
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +#include <asm/tlb_track.h>
> +void
> +domain_flush_vltb_track_entry(struct domain* d,
> +                              const struct tlb_track_entry* entry)
> +{
> +     unsigned long old_rid;
> +     struct vcpu* v;
> +     int cpu;
> +
> +     //tlb_track_entry_printf(entry);
> +     vcpu_get_rr(current, 0, &old_rid);
> +     vcpu_set_rr(current, 0, entry->rid);
> +    
> +     for_each_vcpu(d, v) {
> +             if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
> +                     continue;
> +             if (!vcpu_isset(v->vcpu_id, entry->vcpu_dirty_mask))
> +                     continue;
> +
> +             /* Purge TC entries.
> +                FIXME: clear only if match.  */
> +             vcpu_purge_tr_entry(&PSCBX(v, dtlb));
> +             vcpu_purge_tr_entry(&PSCBX(v, itlb));
> +     }
> +     smp_mb();
> +
> +     for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
> +             //printk("%s:%d cpu %d\n", __func__, __LINE__, cpu);
> +             /* Invalidate VHPT entries.  */
> +             cpu_flush_vhpt_range(cpu, entry->vaddr, PAGE_SIZE);
> +     }
> +     // ptc.ga has release semantics.
> +
> +     /* ptc.ga  */
> +     ia64_global_tlb_purge(entry->vaddr, entry->vaddr + PAGE_SIZE,
> +                           PAGE_SHIFT);
> +
> +     vcpu_set_rr(current, 0, old_rid);
> +}
> +#endif
> +
>  static void flush_tlb_vhpt_all (struct domain *d)
>  {
>       /* First VHPT.  */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/domain.h
> --- a/xen/include/asm-ia64/domain.h   Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/asm-ia64/domain.h   Mon Jul 24 21:35:16 2006 +0900
> @@ -12,28 +12,10 @@
>  #include <xen/cpumask.h>
>  #include <asm/fpswa.h>
>  
> -struct p2m_entry {
> -    volatile pte_t*     pte;
> -    pte_t               used;
> -};
> -
> -static inline void
> -p2m_entry_set(struct p2m_entry* entry, volatile pte_t* pte, pte_t used)
> -{
> -    entry->pte  = pte;
> -    entry->used = used;
> -}
> -
> -static inline int
> -p2m_entry_retry(struct p2m_entry* entry)
> -{
> -    //XXX see lookup_domain_pte().
> -    //    NULL is set for invalid gpaddr for the time being.
> -    if (entry->pte == NULL)
> -        return 0;
> -
> -    return (pte_val(*entry->pte) != pte_val(entry->used));
> -}
> +struct p2m_entry;
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +struct tlb_track;
> +#endif
>  
>  extern void domain_relinquish_resources(struct domain *);
>  
> @@ -118,6 +100,10 @@ struct arch_domain {
>      void *fpswa_inf;
>  
>      struct last_vcpu last_vcpu[NR_CPUS];
> +
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +    struct tlb_track*   tlb_track;
> +#endif
>  };
>  #define INT_ENABLE_OFFSET(v)                   \
>      (sizeof(vcpu_info_t) * (v)->vcpu_id + \
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/tlbflush.h
> --- a/xen/include/asm-ia64/tlbflush.h Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/asm-ia64/tlbflush.h Mon Jul 24 21:35:16 2006 +0900
> @@ -22,6 +22,13 @@ void domain_flush_vtlb_all (void);
>  /* Global range-flush of vTLB.  */
>  void domain_flush_vtlb_range (struct domain *d, u64 vadr, u64 addr_range);
>  
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +struct tlb_track_entry;
> +/* Global entry-flush of vTLB */
> +void domain_flush_vltb_track_entry(struct domain* d,
> +                                const struct tlb_track_entry* entry);
> +#endif
> +
>  /* Final vTLB flush on every dirty cpus.  */
>  void domain_flush_destroy (struct domain *d);
>  
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/vcpu.h
> --- a/xen/include/asm-ia64/vcpu.h     Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/asm-ia64/vcpu.h     Mon Jul 24 21:35:16 2006 +0900
> @@ -158,7 +158,12 @@ extern void vcpu_set_next_timer(VCPU *vc
>  extern void vcpu_set_next_timer(VCPU *vcpu);
>  extern BOOLEAN vcpu_timer_expired(VCPU *vcpu);
>  extern UINT64 vcpu_deliverable_interrupts(VCPU *vcpu);
> +#ifndef CONFIG_XEN_IA64_TLB_TRACK
>  extern void vcpu_itc_no_srlz(VCPU *vcpu, UINT64, UINT64, UINT64, UINT64, UINT64);
> +#else
> +struct p2m_entry;
> +extern void vcpu_itc_no_srlz(VCPU *vcpu, UINT64, UINT64, UINT64, UINT64, UINT64, struct p2m_entry*);
> +#endif
>  extern UINT64 vcpu_get_tmp(VCPU *, UINT64);
>  extern void vcpu_set_tmp(VCPU *, UINT64, UINT64);
>  
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/public/arch-ia64.h
> --- a/xen/include/public/arch-ia64.h  Mon Jul 24 21:34:37 2006 +0900
> +++ b/xen/include/public/arch-ia64.h  Mon Jul 24 21:35:16 2006 +0900
> @@ -357,8 +357,14 @@ DEFINE_XEN_GUEST_HANDLE(vcpu_guest_conte
>                                                  // address space.
>  // flags for page assignement to pseudo physical address space
>  #define _ASSIGN_readonly                0
> +#define _ASSIGN_tlb_track               1
> +
>  #define ASSIGN_readonly                 (1UL << _ASSIGN_readonly)
>  #define ASSIGN_writable                 (0UL << _ASSIGN_readonly) // dummy flag
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +# define ASSIGN_tlb_track               (1UL << _ASSIGN_tlb_track)
> +#endif
> +
>  
>  /* This structure has the same layout of struct ia64_boot_param, defined in
>     <asm/system.h>.  It is redefined here to ease use.  */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/arch/ia64/xen/tlb_track.c
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/arch/ia64/xen/tlb_track.c   Mon Jul 24 21:35:16 2006 +0900
> @@ -0,0 +1,558 @@
> +/******************************************************************************
> + * tlb_track.c
> + *
> + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + *
> + */
> +
> +#include <asm/tlb_track.h>
> +#include <asm/p2m_entry.h>
> +#include <asm/vmx_mm_def.h> // for IA64_RR_SHIFT
> +#include <asm/vcpu.h> // for PSCB()
> +
> +#define CONFIG_TLB_TRACK_DEBUG
> +#ifdef CONFIG_TLB_TRACK_DEBUG
> +# define tlb_track_printd(fmt, ...)     \
> +    printf("%s:%d " fmt, __func__, __LINE__, ##__VA_ARGS__)
> +#else
> +# define tlb_track_printd(fmt, ...)     do { } while (0)
> +#endif
> +
> +#define CONFIG_TLB_TRACK_STAT_KEY_HANDLER
> +#ifdef CONFIG_TLB_TRACK_STAT_KEY_HANDLER
> +#include <asm/regs.h>
> +#include <xen/keyhandler.h>
> +
> +static void
> +dump_tlb_track_stat(unsigned char key)
> +{
> +    tlb_track_stat_printf(&dom0->arch.tlb_track->stat);
> +}
> +#endif
> +
> +static int
> +tlb_track_allocate_entries(struct tlb_track* tlb_track)
> +{
> +    struct page_info* entry_page;
> +    struct tlb_track_entry* track_entries;
> +    unsigned int allocated;
> +    unsigned long i;
> +
> +    BUG_ON(tlb_track->num_free > 0);
> +    if (tlb_track->num_entries >= tlb_track->limit) {
> +        DPRINTK("%s: num_entries %d limit %d\n",
> +                __func__, tlb_track->num_entries, tlb_track->limit);
> +        return -ENOMEM;
> +    }
> +    entry_page = alloc_domheap_page(NULL);
> +    if (entry_page == NULL) {
> +        DPRINTK("%s: domheap page failed. num_entries %d limit %d\n",
> +                __func__, tlb_track->num_entries, tlb_track->limit);
> +        return -ENOMEM;
> +    }
> +
> +    list_add(&entry_page->list, &tlb_track->page_list);
> +    track_entries = (struct tlb_track_entry*)page_to_virt(entry_page);
> +    allocated = PAGE_SIZE / sizeof(track_entries[0]);
> +    tlb_track->num_entries += allocated;
> +    tlb_track->num_free += allocated;
> +    for (i = 0; i < allocated; i++) {
> +        list_add(&track_entries[i].list, &tlb_track->free_list);
> +        //tlb_track_printd("track_entries[%ld] 0x%p\n", i, &track_entries[i]);
> +    }
> +    tlb_track_printd("allocated %d num_entries %d num_free %d\n",
> +                     allocated, tlb_track->num_entries, tlb_track->num_free);
> +    return 0;
> +}
> +
> +
> +int
> +tlb_track_create(struct domain* d)
> +{
> +    struct tlb_track* tlb_track = NULL;
> +    struct page_info* hash_page = NULL;
> +    unsigned int hash_size;
> +    unsigned int hash_shift;
> +    unsigned int i;
> +
> +    tlb_track = xmalloc(struct tlb_track);
> +    if (tlb_track == NULL) {
> +        goto out;
> +    }
> +    hash_page = alloc_domheap_page(NULL);
> +    if (hash_page == NULL) {
> +        goto out;
> +    }
> +
> +    spin_lock_init(&tlb_track->free_list_lock);
> +    INIT_LIST_HEAD(&tlb_track->free_list);
> +    tlb_track->limit = TLB_TRACK_LIMIT_ENTRIES;
> +    tlb_track->num_entries = 0;
> +    tlb_track->num_free = 0;
> +    INIT_LIST_HEAD(&tlb_track->page_list);
> +    if (tlb_track_allocate_entries(tlb_track) < 0) {
> +        goto out;
> +    }
> +
> +    spin_lock_init(&tlb_track->hash_lock);
> +    //XXX hash size optimization
> +    hash_size = PAGE_SIZE / sizeof(tlb_track->hash[0]);
> +    for (hash_shift = 0; (1 << (hash_shift + 1)) < hash_size; hash_shift++)
> +        /* nothing */;
> +    tlb_track->hash_size = (1 << hash_shift);
> +    tlb_track->hash_shift = hash_shift;
> +    tlb_track->hash_mask = (1 << hash_shift) - 1;
> +    tlb_track->hash = page_to_virt(hash_page);
> +    for (i = 0; i < tlb_track->hash_size; i++) {
> +        INIT_LIST_HEAD(&tlb_track->hash[i]);
> +    }
> +
> +    memset(&tlb_track->stat, 0, sizeof(tlb_track->stat));
> +
> +    smp_mb(); // make initialization visible before use.
> +    d->arch.tlb_track = tlb_track;
> +    printk("%s:%d hash 0x%p hash_size %d \n",
> +           __func__, __LINE__, tlb_track->hash, tlb_track->hash_size);
> +
> +#ifdef CONFIG_TLB_TRACK_STAT_KEY_HANDLER
> +    register_keyhandler(
> +                's', dump_tlb_track_stat, "dump dom0 tlb track stats");
> +#endif
> +    return 0;
> +
> +out:
> +    if (hash_page != NULL) {
> +        free_domheap_page(hash_page);
> +    }
> +    if (tlb_track != NULL) {
> +        xfree(tlb_track);
> +    }
> +    return -ENOMEM;
> +}
> +
> +void
> +tlb_track_destroy(struct domain* d)
> +{
> +    struct tlb_track* tlb_track = d->arch.tlb_track;
> +    struct page_info* page;
> +    struct page_info* next;
> +
> +    spin_lock(&tlb_track->free_list_lock);
> +    BUG_ON(tlb_track->num_free != tlb_track->num_entries);
> +
> +    list_for_each_entry_safe(page, next, &tlb_track->page_list, list) {
> +        list_del(&page->list);
> +        free_domheap_page(page);
> +    }
> +
> +    free_domheap_page(virt_to_page(tlb_track->hash));
> +    xfree(tlb_track);
> +    //d->tlb_track = NULL;
> +}
> +
> +static struct tlb_track_entry*
> +tlb_track_get_entry(struct tlb_track* tlb_track)
> +{
> +    struct tlb_track_entry* entry = NULL;
> +    spin_lock(&tlb_track->free_list_lock);
> +    if (tlb_track->num_free == 0) {
> +        (void)tlb_track_allocate_entries(tlb_track);
> +    }
> +    if (tlb_track->num_free > 0) {
> +        BUG_ON(list_empty(&tlb_track->free_list));
> +        entry = list_entry(tlb_track->free_list.next,
> +                           struct tlb_track_entry, list);
> +        tlb_track->num_free--;
> +        list_del(&entry->list);
> +    }
> +    spin_unlock(&tlb_track->free_list_lock);
> +    return entry;
> +}
> +
> +void
> +tlb_track_free_entry(struct tlb_track* tlb_track,
> +                     struct tlb_track_entry* entry)
> +{
> +    spin_lock(&tlb_track->free_list_lock);
> +    list_add(&entry->list, &tlb_track->free_list);
> +    tlb_track->num_free++;
> +    spin_unlock(&tlb_track->free_list_lock);
> +}
> +
> +
> +#include <linux/hash.h>
> +// XXX hash function.
> +static struct list_head*
> +tlb_track_hash_head(struct tlb_track* tlb_track, volatile pte_t* ptep)
> +{
> +    unsigned long hash = hash_long((unsigned long)ptep, tlb_track->hash_shift);
> +    BUG_ON(hash >= tlb_track->hash_size);
> +    BUG_ON((hash & tlb_track->hash_mask) != hash);
> +    return &tlb_track->hash[hash];
> +}
> +
> +static int
> +tlb_track_pte_zapped(pte_t old_pte, pte_t ret_pte)
> +{
> +    if (pte_pfn(old_pte) != pte_pfn(ret_pte) ||
> +        (pte_val(old_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK)) !=
> +        (pte_val(ret_pte) & ~(_PFN_MASK | _PAGE_TLB_TRACK_MASK))) {
> +        // Other thread zapped the p2m entry.
> +        return 1;
> +    }
> +    return 0;
> +}
> +
> +static TLB_TRACK_RET_T
> +tlb_track_insert_or_dirty(struct tlb_track* tlb_track, struct mm_struct* mm,
> +                          volatile pte_t* ptep, pte_t old_pte,
> +                          unsigned long vaddr, unsigned long rid)
> +{
> +    unsigned long mfn = pte_pfn(old_pte);
> +    struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
> +    struct tlb_track_entry* entry;
> +    struct tlb_track_entry* new_entry = NULL;
> +    unsigned long bit_to_be_set = _PAGE_TLB_INSERTED;
> +    pte_t new_pte;
> +    pte_t ret_pte;
> +
> +    struct vcpu* v = current;
> +    TLB_TRACK_RET_T ret = TLB_TRACK_NOT_FOUND;
> +
> +    tlb_track->stat.iod++;
> +    if (!pte_tlb_tracking(old_pte)) {
> +        tlb_track->stat.iod_not_tracked++;
> +        return TLB_TRACK_NOT_TRACKED;
> +    }
> +    if (pte_tlb_inserted_many(old_pte)) {
> +        tlb_track->stat.iod_tracked_many++;
> +        return TLB_TRACK_MANY;
> +    }
> +
> +    // vaddr must be normalized so that it is in rr0 and page aligned.
> +    BUG_ON((vaddr >> IA64_RR_SHIFT) != 0);
> +    BUG_ON((vaddr & ~PAGE_MASK) != 0);
> +#if 0
> +    tlb_track_printd("\n"
> +                     "\tmfn 0x%016lx\n"
> +                     "\told_pte 0x%016lx ptep 0x%p\n"
> +                     "\tptep_val 0x%016lx vaddr 0x%016lx rid %ld\n"
> +                     "\ttlb_track 0x%p head 0x%p\n",
> +                     mfn,
> +                     pte_val(old_pte), ptep, pte_val(*ptep),
> +                     vaddr, rid,
> +                     tlb_track, head);
> +#endif
> +
> + again:
> +    // The zapping side may zap the p2m entry and then remove the tlb track
> +    // entry non-atomically, so we may see a stale tlb track entry here.
> +    // p2m_entry_retry() handles such a case.
> +    // Alternatively, another thread may zap the p2m entry, remove the tlb
> +    // track entry and insert a new one.
> +    spin_lock(&tlb_track->hash_lock);
> +    list_for_each_entry(entry, head, list) {
> +        if (entry->ptep != ptep) {
> +            continue;
> +        }
> +
> +        if (pte_pfn(entry->pte_val) == mfn) {
> +            //tlb_track_entry_printf(entry);
> +            if (entry->vaddr == vaddr && entry->rid == rid) {
> +                //tlb_track_printd("TLB_TRACK_FOUND\n");
> +                ret = TLB_TRACK_FOUND;
> +                tlb_track->stat.iod_found++;
> +#ifdef CONFIG_TLB_TRACK_CNT
> +                entry->cnt++;
> +                if (entry->cnt > TLB_TRACK_CNT_FORCE_MANY) {
> +                    // heuristics:
> +                    // If a page is used to transfer data via a device
> +                    // channel, it is unmapped after only a few accesses
> +                    // (one or two tlb inserts) once the real device I/O
> +                    // completes, i.e. within a short period.
> +                    // This page, however, seems to be accessed many times.
> +                    // We guess that it is used as an I/O ring,
> +                    // so tracking this entry might be useless.
> +                    //tlb_track_entry_printf(entry);
> +                    //tlb_track_printd("cnt = %ld\n", entry->cnt);
> +                    tlb_track->stat.iod_force_many++;
> +                    goto force_many;
> +                }
> +#endif
> +                goto found;
> +            } else {
> +#ifdef CONFIG_TLB_TRACK_CNT
> +            force_many:
> +#endif
> +                if (!pte_tlb_inserted(old_pte)) {
> +                    printk("%s:%d racy update\n", __func__, __LINE__);
> +                    old_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED);
> +                }
> +                new_pte = __pte(pte_val(old_pte) | _PAGE_TLB_INSERTED_MANY);
> +                ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
> +                if (pte_val(ret_pte) != pte_val(old_pte)) {
> +                    //tlb_track_printd("TLB_TRACK_AGAIN\n");
> +                    ret = TLB_TRACK_AGAIN;
> +                    tlb_track->stat.iod_again++;
> +                } else {
> +                    //tlb_track_printd("TLB_TRACK_MANY del entry 0x%p\n", entry);
> +                    ret = TLB_TRACK_MANY;
> +                    list_del(&entry->list);
> +                    //tlb_track_entry_printf(entry);
> +                    tlb_track->stat.iod_tracked_many_del++;
> +                }
> +                goto out;
> +            }
> +        }
> +
> +        // Another thread changed the p2m entry, removed the old tlb track
> +        // entry and inserted a new one after we read old_pte, but before we
> +        // took the spinlock.
> +        //tlb_track_printd("TLB_TRACK_AGAIN\n");
> +        ret = TLB_TRACK_AGAIN;
> +        tlb_track->stat.iod_again++;
> +        goto out;
> +    }
> +
> +    entry = NULL; // prevent freeing entry.
> +    if (pte_tlb_inserted(old_pte)) {
> +        // Another thread removed the tlb_track_entry after we got old_pte
> +        // but before we got the spin lock.
> +        ret = TLB_TRACK_AGAIN;
> +        tlb_track->stat.iod_again++;
> +        goto out;
> +    }
> +    if (new_entry == NULL && bit_to_be_set == _PAGE_TLB_INSERTED) {
> +        spin_unlock(&tlb_track->hash_lock);
> +        new_entry = tlb_track_get_entry(tlb_track);
> +        if (new_entry == NULL) {
> +            tlb_track_printd("get_entry failed\n");
> +            // entry can't be allocated.
> +            // fall back to full flush mode.
> +            bit_to_be_set |= _PAGE_TLB_INSERTED_MANY;
> +            tlb_track->stat.iod_new_failed++;
> +        }
> +        //tlb_track_printd("new_entry 0x%p\n", new_entry);
> +        tlb_track->stat.iod_new_entry++;
> +        goto again;
> +    }
> +
> +    BUG_ON(pte_tlb_inserted_many(old_pte));
> +    new_pte = __pte(pte_val(old_pte) | bit_to_be_set);
> +    ret_pte = ptep_cmpxchg_rel(mm, vaddr, ptep, old_pte, new_pte);
> +    if (pte_val(old_pte) != pte_val(ret_pte)) {
> +        if (tlb_track_pte_zapped(old_pte, ret_pte)) {
> +            //tlb_track_printd("zapped TLB_TRACK_AGAIN\n");
> +            ret = TLB_TRACK_AGAIN;
> +            tlb_track->stat.iod_again++;
> +            goto out;
> +        }
> +
> +        // Other thread set _PAGE_TLB_INSERTED and/or _PAGE_TLB_INSERTED_MANY
> +        if (pte_tlb_inserted_many(ret_pte)) {
> +            // Other thread already set _PAGE_TLB_INSERTED_MANY and
> +            // removed the entry.
> +            //tlb_track_printd("inserted TLB_TRACK_MANY\n");
> +            BUG_ON(!pte_tlb_inserted(ret_pte));
> +            ret = TLB_TRACK_MANY;
> +            tlb_track->stat.iod_new_many++;
> +            goto out;
> +        }
> +        BUG_ON(pte_tlb_inserted(ret_pte));
> +        BUG();
> +    }
> +    if (new_entry) {
> +        //tlb_track_printd("inserting new_entry 0x%p\n", new_entry);
> +        entry = new_entry;
> +        new_entry = NULL;
> +
> +        entry->ptep = ptep;
> +        entry->pte_val = old_pte;
> +        entry->vaddr = vaddr;
> +        entry->rid = rid;
> +        cpus_clear(entry->pcpu_dirty_mask);
> +        vcpus_clear(entry->vcpu_dirty_mask);
> +        list_add(&entry->list, head);
> +
> +#ifdef CONFIG_TLB_TRACK_CNT
> +        entry->cnt = 0;
> +#endif
> +        tlb_track->stat.iod_insert++;
> +        //tlb_track_entry_printf(entry);
> +    } else {
> +        goto out;
> +    }
> +
> + found:
> +    BUG_ON(v->processor >= NR_CPUS);
> +    cpu_set(v->processor, entry->pcpu_dirty_mask);
> +    BUG_ON(v->vcpu_id >= NR_CPUS);
> +    vcpu_set(v->vcpu_id, entry->vcpu_dirty_mask);
> +    tlb_track->stat.iod_dirtied++;
> +
> + out:
> +    spin_unlock(&tlb_track->hash_lock);
> +    if (ret == TLB_TRACK_MANY && entry != NULL) {
> +        tlb_track_free_entry(tlb_track, entry);
> +    }
> +    if (new_entry != NULL) {
> +        tlb_track_free_entry(tlb_track, new_entry);
> +    }
> +    return ret;
> +}
> +
> +void
> +vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
> +                               struct p2m_entry* entry)
> +{
> +    unsigned long vrn = vaddr >> IA64_RR_SHIFT;
> +    unsigned long rid = PSCB(vcpu, rrs[vrn]);
> +    TLB_TRACK_RET_T ret;
> +
> +    vaddr = (vaddr << 3) >> 3;// mask rid bit
> +    vaddr &= PAGE_MASK;
> +    ret = tlb_track_insert_or_dirty(vcpu->domain->arch.tlb_track,
> +                                    &vcpu->domain->arch.mm,
> +                                    entry->ptep, entry->used,
> +                                    vaddr, rid);
> +    if (ret == TLB_TRACK_AGAIN) {
> +        p2m_entry_set_retry(entry);
> +    }
> +}
> +
> +TLB_TRACK_RET_T
> +tlb_track_search_and_remove(struct tlb_track* tlb_track,
> +                            volatile pte_t* ptep, pte_t old_pte,
> +                            struct tlb_track_entry** entryp)
> +{
> +    unsigned long mfn = pte_pfn(old_pte);
> +    struct list_head* head = tlb_track_hash_head(tlb_track, ptep);
> +    struct tlb_track_entry* entry;
> +
> +    tlb_track->stat.sar++;
> +    if (!pte_tlb_tracking(old_pte)) {
> +        tlb_track->stat.sar_not_tracked++;
> +        return TLB_TRACK_NOT_TRACKED;
> +    }
> +    if (!pte_tlb_inserted(old_pte)) {
> +        BUG_ON(pte_tlb_inserted_many(old_pte));
> +        tlb_track->stat.sar_not_found++;
> +        return TLB_TRACK_NOT_FOUND;
> +    }
> +    if (pte_tlb_inserted_many(old_pte)) {
> +        BUG_ON(!pte_tlb_inserted(old_pte));
> +        tlb_track->stat.sar_many++;
> +        return TLB_TRACK_MANY;
> +    }
> +
> +    spin_lock(&tlb_track->hash_lock);
> +    list_for_each_entry(entry, head, list) {
> +        if (entry->ptep != ptep) {
> +            continue;
> +        }
> +        if (pte_pfn(entry->pte_val) == mfn) {
> +            list_del(&entry->list);
> +            tlb_track->stat.sar_found++;
> +            spin_unlock(&tlb_track->hash_lock);
> +            *entryp = entry;
> +            //tlb_track_entry_printf(entry);
> +#ifdef CONFIG_TLB_TRACK_CNT
> +            //tlb_track_printd("cnt = %ld\n", entry->cnt);
> +#endif
> +            return TLB_TRACK_FOUND;
> +        }
> +        BUG();
> +    }
> +    BUG();
> +    spin_unlock(&tlb_track->hash_lock);
> +    return TLB_TRACK_NOT_TRACKED;
> +}
> +
> +void
> +tlb_track_stat_printf(const struct tlb_track_stat* stat)
> +{
> +    printk("iod %ld\n"
> +           "iod_again %ld\n"
> +           "iod_not_tracked %ld\n"
> +           "iod_force_many %ld\n"
> +           "iod_tracked_many %ld\n"
> +           "iod_tracked_many_del %ld\n"
> +           "iod_found %ld\n"
> +           "iod_new_entry %ld\n"
> +           "iod_new_failed %ld\n"
> +           "iod_new_many %ld\n"
> +           "iod_insert %ld\n"
> +           "iod_dirtied %ld\n"
> +           "sar %ld\n"
> +           "sar_not_tracked %ld\n"
> +           "sar_not_found %ld\n"
> +           "sar_found %ld\n"
> +           "sar_many %ld\n",
> +           stat->iod,
> +           stat->iod_again,
> +           stat->iod_not_tracked,
> +           stat->iod_force_many,
> +           stat->iod_tracked_many,
> +           stat->iod_tracked_many_del,
> +           stat->iod_found,
> +           stat->iod_new_entry,
> +           stat->iod_new_failed,
> +           stat->iod_new_many,
> +           stat->iod_insert,
> +           stat->iod_dirtied,
> +           stat->sar,
> +           stat->sar_not_tracked,
> +           stat->sar_not_found,
> +           stat->sar_found,
> +           stat->sar_many);
> +}
> +
> +// for debug
> +void
> +__tlb_track_entry_printf(const char* func, int line,
> +                         const struct tlb_track_entry* entry)
> +{
> +    char pcpumask_buf[NR_CPUS + 1];
> +    char vcpumask_buf[MAX_VIRT_CPUS + 1];
> +    cpumask_scnprintf(pcpumask_buf, sizeof(pcpumask_buf),
> +                      entry->pcpu_dirty_mask);
> +    vcpumask_scnprintf(vcpumask_buf, sizeof(vcpumask_buf),
> +                       entry->vcpu_dirty_mask);
> +    printk("%s:%d\n"
> +           "\tmfn 0x%016lx\n"
> +           "\told_pte 0x%016lx ptep 0x%p\n"
> +           "\tpte_val 0x%016lx vaddr 0x%016lx rid %ld\n"
> +           "\tpcpu_dirty_mask %s vcpu_dirty_mask %s\n"
> +           "\tentry 0x%p\n",
> +           func, line,
> +           pte_pfn(entry->pte_val),
> +           pte_val(entry->pte_val), entry->ptep, pte_val(*entry->ptep),
> +           entry->vaddr, entry->rid,
> +           pcpumask_buf, vcpumask_buf,
> +           entry);
> +}
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/p2m_entry.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-ia64/p2m_entry.h        Mon Jul 24 21:35:16 2006 +0900
> @@ -0,0 +1,76 @@
> +/******************************************************************************
> + * p2m_entry.h
> + *
> + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + *
> + */
> +
> +#ifndef __ASM_P2M_ENTRY_H__
> +#define __ASM_P2M_ENTRY_H__
> +
> +#include <asm/tlb_track.h>
> +
> +struct p2m_entry {
> +#define P2M_PTE_ALWAYS_RETRY ((volatile pte_t*) -1)
> +    volatile pte_t*     ptep;
> +    pte_t               used;
> +};
> +
> +static inline void
> +p2m_entry_set(struct p2m_entry* entry, volatile pte_t* ptep, pte_t used)
> +{
> +    entry->ptep = ptep;
> +    entry->used = used;
> +}
> +
> +static inline void
> +p2m_entry_set_retry(struct p2m_entry* entry)
> +{
> +    entry->ptep = P2M_PTE_ALWAYS_RETRY;
> +}
> +
> +static inline int
> +p2m_entry_retry(struct p2m_entry* entry)
> +{
> +    //XXX see lookup_domain_pte().
> +    //    NULL is set for invalid gpaddr for the time being.
> +    if (entry->ptep == NULL)
> +        return 0;
> +
> +    if (entry->ptep == P2M_PTE_ALWAYS_RETRY)
> +        return 1;
> +
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +    return ((pte_val(*entry->ptep) & ~_PAGE_TLB_TRACK_MASK) !=
> +            (pte_val(entry->used) & ~_PAGE_TLB_TRACK_MASK));
> +#else
> +    return (pte_val(*entry->ptep) != pte_val(entry->used));
> +#endif
> +}
> +
> +#endif // __ASM_P2M_ENTRY_H__
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> diff -r c654d462c448 -r cb0aa2b2e180 xen/include/asm-ia64/tlb_track.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-ia64/tlb_track.h        Mon Jul 24 21:35:16 2006 +0900
> @@ -0,0 +1,201 @@
> +/******************************************************************************
> + * tlb_track.h
> + *
> + * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
> + *                    VA Linux Systems Japan K.K.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
> + *
> + */
> +
> +#ifndef __TLB_TRACK_H__
> +#define __TLB_TRACK_H__
> +
> +#ifdef CONFIG_XEN_IA64_TLB_TRACK
> +
> +#include <asm/domain.h>
> +#include <xen/list.h>
> +
> +#define _PAGE_TLB_TRACKING_BIT          53
> +#define _PAGE_TLB_INSERTED_BIT          54
> +#define _PAGE_TLB_INSERTED_MANY_BIT     55
> +
> +#define _PAGE_TLB_TRACKING              (1UL << _PAGE_TLB_TRACKING_BIT)
> +#define _PAGE_TLB_INSERTED              (1UL << _PAGE_TLB_INSERTED_BIT)
> +#define _PAGE_TLB_INSERTED_MANY         (1UL << _PAGE_TLB_INSERTED_MANY_BIT)
> +#define _PAGE_TLB_TRACK_MASK            (_PAGE_TLB_TRACKING | _PAGE_TLB_INSERTED | _PAGE_TLB_INSERTED_MANY)
> +
> +#define pte_tlb_tracking(pte)                   \
> +    ((pte_val(pte) & _PAGE_TLB_TRACKING) != 0)
> +#define pte_tlb_inserted(pte)                   \
> +    ((pte_val(pte) & _PAGE_TLB_INSERTED) != 0)
> +#define pte_tlb_inserted_many(pte)                  \
> +    ((pte_val(pte) & _PAGE_TLB_INSERTED_MANY) != 0)
> +
> +
> +// vcpu mask
> +// stolen from cpumask.h
> +typedef struct { DECLARE_BITMAP(bits, MAX_VIRT_CPUS); } vcpumask_t;
> +
> +#define vcpu_set(vcpu, dst) __vcpu_set((vcpu), &(dst))
> +static inline void __vcpu_set(int vcpu, volatile vcpumask_t *dstp)
> +{
> +     set_bit(vcpu, dstp->bits);
> +}
> +#define vcpus_clear(dst) __vcpus_clear(&(dst), MAX_VIRT_CPUS)
> +static inline void __vcpus_clear(vcpumask_t *dstp, int nbits)
> +{
> +     bitmap_zero(dstp->bits, nbits);
> +}
> +/* No static inline type checking - see Subtlety (1) above. */
> +#define vcpu_isset(vcpu, vcpumask) test_bit((vcpu), (vcpumask).bits)
> +
> +#define vcpumask_scnprintf(buf, len, src) \
> +                     __vcpumask_scnprintf((buf), (len), &(src), MAX_VIRT_CPUS)
> +static inline int __vcpumask_scnprintf(char *buf, int len,
> +                                     const vcpumask_t *srcp, int nbits)
> +{
> +     return bitmap_scnprintf(buf, len, srcp->bits, nbits);
> +}
> +
> +
> +// TODO: compact this structure.
> +struct tlb_track_entry {
> +    struct list_head   list;
> +
> +    
> +    volatile pte_t*     ptep;            // corresponding p2m entry
> +
> +    //XXX should we use TR_ENTRY?
> +    pte_t               pte_val;        // mfn and other flags
> +                                        // pte_val.p = 1:
> +                                        //   tlb entry is inserted.
> +                                        // pte_val.p = 0:
> +                                        //   a tlb entry was inserted at some
> +                                        //   point (hence this entry exists),
> +                                        //   but a tlb purge has since been
> +                                        //   issued, so this virtual address
> +                                        //   no longer needs to be purged.
> +    unsigned long       vaddr;          // virtual address
> +    unsigned long       rid;            // rid
> +
> +    cpumask_t           pcpu_dirty_mask;
> +    vcpumask_t          vcpu_dirty_mask;
> +    // tlbflush_timestamp;
> +
> +#define CONFIG_TLB_TRACK_CNT
> +#ifdef CONFIG_TLB_TRACK_CNT
> +#define TLB_TRACK_CNT_FORCE_MANY        256 //XXX how many?
> +    unsigned long       cnt;
> +#endif
> +};
> +
> +struct tlb_track_stat {
> +    // insert or dirty
> +    unsigned long       iod;
> +    unsigned long       iod_again;
> +    unsigned long       iod_not_tracked;
> +    unsigned long       iod_force_many;
> +    unsigned long       iod_tracked_many;
> +    unsigned long       iod_tracked_many_del;
> +    unsigned long       iod_found;
> +    unsigned long       iod_new_entry;
> +    unsigned long       iod_new_failed;
> +    unsigned long       iod_new_many;
> +    unsigned long       iod_insert;
> +    unsigned long       iod_dirtied;
> +    
> +    // search and remove
> +    unsigned long       sar;
> +    unsigned long       sar_not_tracked;
> +    unsigned long       sar_not_found;
> +    unsigned long       sar_found;
> +    unsigned long       sar_many;
> +};
> +void tlb_track_stat_printf(const struct tlb_track_stat* stat); 
> +
> +struct tlb_track {
> +
> +// see __gnttab_map_grant_ref()
> +// A domain can map up to MAPTRACK_MAX_ENTRIES granted pages.
> +#define TLB_TRACK_LIMIT_ENTRIES                                     \
> +    (MAPTRACK_MAX_ENTRIES * (PAGE_SIZE / sizeof(struct tlb_track)))
> +
> +    spinlock_t                  free_list_lock;
> +    struct list_head            free_list;
> +    unsigned int                limit;
> +    unsigned int                num_entries;
> +    unsigned int                num_free;
> +    struct list_head            page_list;
> +
> +    // XXX hash table size
> +    spinlock_t                  hash_lock;
> +    unsigned int                hash_size;
> +    unsigned int                hash_shift;
> +    unsigned int                hash_mask;
> +    struct list_head*           hash;
> +
> +    struct tlb_track_stat       stat;
> +};
> +
> +int tlb_track_create(struct domain* d);
> +void tlb_track_destroy(struct domain* d);
> +
> +void tlb_track_free_entry(struct tlb_track* tlb_track,
> +                          struct tlb_track_entry* entry);
> +
> +struct p2m_entry;
> +void
> +vcpu_tlb_track_insert_or_dirty(struct vcpu *vcpu, unsigned long vaddr,
> +                               struct p2m_entry* entry);
> +
> +// return value
> +// NULL if this entry is used
> +// entry if this entry isn't used
> +enum TLB_TRACK_RET {
> +    TLB_TRACK_NOT_TRACKED,
> +    TLB_TRACK_NOT_FOUND,
> +    TLB_TRACK_FOUND,
> +    TLB_TRACK_MANY,
> +    TLB_TRACK_AGAIN,
> +};
> +typedef enum TLB_TRACK_RET TLB_TRACK_RET_T;
> +
> +TLB_TRACK_RET_T
> +tlb_track_search_and_remove(struct tlb_track* tlb_track, 
> +                            volatile pte_t* ptep, pte_t old_pte, 
> +                            struct tlb_track_entry** entryp);
> +
> +void
> +__tlb_track_entry_printf(const char* func, int line,
> +                         const struct tlb_track_entry* entry);
> +#define tlb_track_entry_printf(entry)                       \
> +    __tlb_track_entry_printf(__func__, __LINE__, (entry))
> +#else
> +// define no-op stubs here when CONFIG_XEN_IA64_TLB_TRACK is disabled
> +
> +#endif // CONFIG_XEN_IA64_TLB_TRACK
> +
> +#endif // __TLB_TRACK_H__
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-set-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
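
(Not part of the patches -- just a minimal sketch of how the hooks declared
in tlb_track.h above are intended to be used.  The call sites, the
d->arch.tlb_track field, the example_* function names and the exact
signature of domain_flush_vltb_track_entry() are my assumptions for
illustration; the real integration is in the attached tlb_track patch.)

/* Hypothetical caller on the TLB-insert path: record that a TLB entry
 * derived from this p2m entry may now be cached for vaddr. */
static void example_on_tlb_insert(struct vcpu *v, unsigned long vaddr,
                                  struct p2m_entry *entry)
{
    vcpu_tlb_track_insert_or_dirty(v, vaddr, entry);
}

/* Hypothetical caller on the p2m/pte-update path: choose how much to
 * flush based on what was tracked for this pte. */
static void example_on_pte_change(struct domain *d,
                                  volatile pte_t *ptep, pte_t old_pte)
{
    struct tlb_track_entry *entry = NULL;

    switch (tlb_track_search_and_remove(d->arch.tlb_track /* assumed field */,
                                        ptep, old_pte, &entry)) {
    case TLB_TRACK_FOUND:
        /* Exactly one tracked insertion: flush only the recorded virtual
         * address on the dirty vcpus/pcpus, then release the entry. */
        domain_flush_vltb_track_entry(d, entry);
        tlb_track_free_entry(d->arch.tlb_track, entry);
        break;
    case TLB_TRACK_NOT_TRACKED:
    case TLB_TRACK_MANY:
    case TLB_TRACK_AGAIN:
        /* No precise information: fall back to a full flush. */
        domain_flush_vtlb_all();
        break;
    case TLB_TRACK_NOT_FOUND:
        /* Tracked, but no live insertion: nothing to flush. */
        break;
    }
}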

> # HG changeset patch
> # User yamahata@xxxxxxxxxxxxx
> # Node ID a56d48066373c9fe317e986580c08394fe89fc7e
> # Parent  cb0aa2b2e180d76d09592ed32338f9cb4ac5b7a0
> Implement the per-vcpu VHPT option: allocate a VHPT per vcpu.
> Add a compile-time option, xen_ia64_pervcpu_vhpt=y, to enable it,
> and a xen boot-time option, pervcpu_vhpt=0, to disable it.
> This patch depends on the tlb tracking patch.
> PATCHNAME: pervcpu_vhpt
> 
> Signed-off-by: Isaku Yamahata <yamahata@xxxxxxxxxxxxx>
> 
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/Rules.mk
> --- a/xen/arch/ia64/Rules.mk  Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/Rules.mk  Mon Jul 24 21:37:15 2006 +0900
> @@ -42,6 +42,9 @@ ifeq ($(xen_ia64_tlb_track),y)
>  ifeq ($(xen_ia64_tlb_track),y)
>  CFLAGS       += -DCONFIG_XEN_IA64_TLB_TRACK
>  endif
> +ifeq ($(xen_ia64_pervcpu_vhpt),y)
> +CFLAGS       += -DCONFIG_XEN_IA64_PERVCPU_VHPT
> +endif
>  ifeq ($(no_warns),y)
>  CFLAGS       += -Wa,--fatal-warnings -Werror -Wno-uninitialized
>  endif
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/xen/domain.c
> --- a/xen/arch/ia64/xen/domain.c      Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/xen/domain.c      Mon Jul 24 21:37:15 2006 +0900
> @@ -117,8 +117,12 @@ static void flush_vtlb_for_context_switc
>               if (VMX_DOMAIN(vcpu)) {
>                       // currently vTLB for vt-i domian is per vcpu.
>                       // so any flushing isn't needed.
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +             } else if (HAS_PERVCPU_VHPT(v->domain)) {
> +                     // nothing to do
> +#endif
>               } else {
> -                     vhpt_flush();
> +                     local_vhpt_flush();
>               }
>               local_flush_tlb_all();
>       }
> @@ -133,9 +137,13 @@ void schedule_tail(struct vcpu *prev)
>               vmx_do_launch(current);
>       } else {
>               ia64_set_iva(&ia64_ivt);
> -             ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
> -                     VHPT_ENABLED);
> +             // Disable the VHPT. Without this, ia64_new_rr7() might
> +             // cause a VHPT fault because it flushes dtr[IA64_TR_VHPT].
> +             // (VHPT_SIZE_LOG2 << 2) is set only to avoid a
> +             // Reserved Register/Field fault.
> +             ia64_set_pta(VHPT_SIZE_LOG2 << 2);
>               load_region_regs(current);
> +             ia64_set_pta(vcpu_pta(current));
>               vcpu_load_kernel_regs(current);
>               __ia64_per_cpu_var(current_psr_i_addr) = &current->domain->
>                 shared_info->vcpu_info[current->vcpu_id].evtchn_upcall_mask;
> @@ -186,9 +194,13 @@ if (!i--) { i = 1000000; printk("+"); }
>  
>       nd = current->domain;
>       if (!is_idle_domain(nd)) {
> -             ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
> -                          VHPT_ENABLED);
> +             // Disable the VHPT. Without this, ia64_new_rr7() might
> +             // cause a VHPT fault because it changes dtr[IA64_TR_VHPT].
> +             // (VHPT_SIZE_LOG2 << 2) is set only to avoid a
> +             // Reserved Register/Field fault.
> +             ia64_set_pta(VHPT_SIZE_LOG2 << 2);
>               load_region_regs(current);
> +             ia64_set_pta(vcpu_pta(current));
>               vcpu_load_kernel_regs(current);
>               vcpu_set_next_timer(current);
>               if (vcpu_timer_expired(current))
> @@ -305,6 +317,17 @@ struct vcpu *alloc_vcpu_struct(struct do
>           v->arch.ending_rid = d->arch.ending_rid;
>           v->arch.breakimm = d->arch.breakimm;
>           v->arch.last_processor = INVALID_PROCESSOR;
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +        if (HAS_PERVCPU_VHPT(d)) {
> +            if (pervcpu_vhpt_alloc(v) < 0) {
> +                free_xenheap_pages(v->arch.privregs,
> +                                   get_order(sizeof(mapped_regs_t)));
> +                free_xenheap_pages(v, KERNEL_STACK_SIZE_ORDER);
> +                return NULL;
> +            }
> +        }
> +#endif
>       }
>  
>       return v;
> @@ -315,6 +338,10 @@ void free_vcpu_struct(struct vcpu *v)
>       if (VMX_DOMAIN(v))
>               vmx_relinquish_vcpu_resources(v);
>       else {
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +        if (HAS_PERVCPU_VHPT(v->domain))
> +            pervcpu_vhpt_free(v);
> +#endif
>               if (v->arch.privregs != NULL)
>                       free_xenheap_pages(v->arch.privregs,
>                                     get_order_from_shift(XMAPPEDREGS_SHIFT));
> @@ -340,6 +367,11 @@ static void init_switch_stack(struct vcp
>       memset(v->arch._thread.fph,0,sizeof(struct ia64_fpreg)*96);
>  }
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +static int opt_pervcpu_vhpt = 1;
> +integer_param("pervcpu_vhpt", opt_pervcpu_vhpt);
> +#endif
> +
>  int arch_domain_create(struct domain *d)
>  {
>       int i;
> @@ -354,6 +386,13 @@ int arch_domain_create(struct domain *d)
>       if (is_idle_domain(d))
>           return 0;
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     d->arch.has_pervcpu_vhpt = opt_pervcpu_vhpt;
> +#if 1
> +     DPRINTK("%s:%d domain %d pervcpu_vhpt %d\n",
> +             __func__, __LINE__, d->domain_id, d->arch.has_pervcpu_vhpt);
> +#endif
> +#endif
>  #ifdef CONFIG_XEN_IA64_TLB_TRACK
>       if (tlb_track_create(d) < 0)
>         goto fail_nomem;
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/xen/regionreg.c
> --- a/xen/arch/ia64/xen/regionreg.c   Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/xen/regionreg.c   Mon Jul 24 21:37:15 2006 +0900
> @@ -260,7 +260,7 @@ int set_one_rr(unsigned long rr, unsigne
>       } else if (rreg == 7) {
>               ia64_new_rr7(vmMangleRID(newrrv.rrval),v->domain->shared_info,
>                            v->arch.privregs, v->domain->arch.shared_info_va,
> -                          __get_cpu_var(vhpt_paddr));
> +                          vcpu_vhpt_maddr(v));
>       } else {
>               set_rr(rr,newrrv.rrval);
>       }
> diff -r cb0aa2b2e180 -r a56d48066373 xen/arch/ia64/xen/vhpt.c
> --- a/xen/arch/ia64/xen/vhpt.c        Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/arch/ia64/xen/vhpt.c        Mon Jul 24 21:37:15 2006 +0900
> @@ -23,18 +23,30 @@ DEFINE_PER_CPU (unsigned long, vhpt_padd
>  DEFINE_PER_CPU (unsigned long, vhpt_paddr);
>  DEFINE_PER_CPU (unsigned long, vhpt_pend);
>  
> -void vhpt_flush(void)
> -{
> -     struct vhpt_lf_entry *v = __va(__ia64_per_cpu_var(vhpt_paddr));
> +static void __vhpt_flush(unsigned long vhpt_maddr)
> +{
> +     struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
>       int i;
>  
>       for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++)
>               v->ti_tag = INVALID_TI_TAG;
>  }
>  
> -static void vhpt_erase(void)
> -{
> -     struct vhpt_lf_entry *v = (struct vhpt_lf_entry *)VHPT_ADDR;
> +void local_vhpt_flush(void)
> +{
> +     __vhpt_flush(__ia64_per_cpu_var(vhpt_paddr));
> +}
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +static void vcpu_vhpt_flush(struct vcpu* v)
> +{
> +     __vhpt_flush(vcpu_vhpt_maddr(v));
> +}
> +#endif
> +
> +static void vhpt_erase(unsigned long vhpt_maddr)
> +{
> +     struct vhpt_lf_entry *v = (struct vhpt_lf_entry*)__va(vhpt_maddr);
>       int i;
>  
>       for (i = 0; i < VHPT_NUM_ENTRIES; i++, v++) {
> @@ -44,17 +56,6 @@ static void vhpt_erase(void)
>               v->ti_tag = INVALID_TI_TAG;
>       }
>       // initialize cache too???
> -}
> -
> -
> -static void vhpt_map(unsigned long pte)
> -{
> -     unsigned long psr;
> -
> -     psr = ia64_clear_ic();
> -     ia64_itr(0x2, IA64_TR_VHPT, VHPT_ADDR, pte, VHPT_SIZE_LOG2);
> -     ia64_set_psr(psr);
> -     ia64_srlz_i();
>  }
>  
>  void vhpt_insert (unsigned long vadr, unsigned long pte, unsigned long logps)
> @@ -101,7 +102,7 @@ void vhpt_multiple_insert(unsigned long 
>  
>  void vhpt_init(void)
>  {
> -     unsigned long paddr, pte;
> +     unsigned long paddr;
>       struct page_info *page;
>  #if !VHPT_ENABLED
>       return;
> @@ -121,13 +122,54 @@ void vhpt_init(void)
>       __get_cpu_var(vhpt_pend) = paddr + (1 << VHPT_SIZE_LOG2) - 1;
>       printf("vhpt_init: vhpt paddr=0x%lx, end=0x%lx\n",
>               paddr, __get_cpu_var(vhpt_pend));
> -     pte = pte_val(pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL));
> -     vhpt_map(pte);
> -     ia64_set_pta(VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) |
> -             VHPT_ENABLED);
> -     vhpt_erase();
> -}
> -
> +     vhpt_erase(paddr);
> +     // we don't enable VHPT here.
> +     // context_switch() or schedule_tail() does it.
> +}
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +int
> +pervcpu_vhpt_alloc(struct vcpu *v)
> +{
> +     unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;
> +     DPRINTK("%s:%d allocating d 0x%p %d v 0x%p %d\n",
> +             __func__, __LINE__,
> +             v->domain, v->domain->domain_id,
> +             v, v->vcpu_id);
> +
> +     v->arch.vhpt_entries =
> +             (1UL << vhpt_size_log2) / sizeof(struct vhpt_lf_entry);
> +     v->arch.vhpt_page =
> +             alloc_domheap_pages(NULL, vhpt_size_log2 - PAGE_SHIFT, 0);
> +     if (!v->arch.vhpt_page)
> +             return -ENOMEM;
> +     
> +     v->arch.vhpt_maddr = page_to_maddr(v->arch.vhpt_page);
> +     if (v->arch.vhpt_maddr & ((1 << VHPT_SIZE_LOG2) - 1))
> +             panic("pervcpu_vhpt_alloc: bad VHPT alignment!\n");
> +
> +     v->arch.pta.val = 0; // zero clear
> +     v->arch.pta.ve = 1; // enable vhpt
> +     v->arch.pta.size = VHPT_SIZE_LOG2;
> +     v->arch.pta.vf = 1; // long format
> +     v->arch.pta.base = v->arch.vhpt_maddr >> 15;
> +
> +     vhpt_erase(v->arch.vhpt_maddr);
> +     return 0;
> +}
> +
> +void
> +pervcpu_vhpt_free(struct vcpu *v)
> +{
> +     unsigned long vhpt_size_log2 = VHPT_SIZE_LOG2;
> +     DPRINTK("%s:%d freeing d 0x%p %d v 0x%p %d\n",
> +             __func__, __LINE__,
> +             v->domain, v->domain->domain_id,
> +             v, v->vcpu_id);
> +
> +     free_domheap_pages(v->arch.vhpt_page, vhpt_size_log2 - PAGE_SHIFT);
> +}
> +#endif
>  
>  void vcpu_flush_vtlb_all(struct vcpu *v)
>  {
> @@ -136,7 +178,15 @@ void vcpu_flush_vtlb_all(struct vcpu *v)
>       vcpu_purge_tr_entry(&PSCBX(v,itlb));
>  
>       /* Then VHPT.  */
> -     vhpt_flush ();
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     if (HAS_PERVCPU_VHPT(v->domain)) {
> +             vcpu_vhpt_flush(v);
> +     } else {
> +             local_vhpt_flush();
> +     }
> +#else
> +     local_vhpt_flush();
> +#endif
>  
>       /* Then mTLB.  */
>       local_flush_tlb_all ();
> @@ -169,9 +219,10 @@ void domain_flush_vtlb_all (void)
>       }
>  }
>  
> -static void cpu_flush_vhpt_range (int cpu, u64 vadr, u64 addr_range)
> -{
> -     void *vhpt_base = __va(per_cpu(vhpt_paddr, cpu));
> +static void __flush_vhpt_range(unsigned long vhpt_maddr,
> +                              u64 vadr, u64 addr_range)
> +{
> +     void *vhpt_base = __va(vhpt_maddr);
>  
>       while ((long)addr_range > 0) {
>               /* Get the VHPT entry.  */
> @@ -184,9 +235,30 @@ static void cpu_flush_vhpt_range (int cp
>       }
>  }
>  
> +static void cpu_vhpt_flush_range(int cpu, u64 vadr, u64 addr_range)
> +{
> +     __flush_vhpt_range(per_cpu(vhpt_paddr, cpu), vadr, addr_range);
> +}
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +static void vcpu_vhpt_flush_range(struct vcpu* v, u64 vadr, u64 addr_range)
> +{
> +     __flush_vhpt_range(vcpu_vhpt_maddr(v), vadr, addr_range);
> +}
> +#endif
> +
>  void vcpu_flush_tlb_vhpt_range (u64 vadr, u64 log_range)
>  {
> -     cpu_flush_vhpt_range (current->processor, vadr, 1UL << log_range);
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     if (HAS_PERVCPU_VHPT(current->domain)) {
> +             vcpu_vhpt_flush_range(current, vadr, 1UL << log_range);
> +     } else {
> +             cpu_vhpt_flush_range(current->processor,
> +                                  vadr, 1UL << log_range);
> +     }
> +#else
> +     cpu_vhpt_flush_range(current->processor, vadr, 1UL << log_range);
> +#endif
>       ia64_ptcl(vadr, log_range << 2);
>       ia64_srlz_i();
>  }
> @@ -218,8 +290,17 @@ void domain_flush_vtlb_range (struct dom
>               if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
>                       continue;
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +             if (HAS_PERVCPU_VHPT(d)) {
> +                     vcpu_vhpt_flush_range(v, vadr, addr_range);
> +             } else {
> +                     /* Invalidate VHPT entries.  */
> +                     cpu_vhpt_flush_range(v->processor, vadr, addr_range);
> +             }
> +#else
>               /* Invalidate VHPT entries.  */
> -             cpu_flush_vhpt_range (v->processor, vadr, addr_range);
> +             cpu_vhpt_flush_range(v->processor, vadr, addr_range);
> +#endif
>       }
>       // ptc.ga has release semantics.
>  
> @@ -254,11 +335,30 @@ domain_flush_vltb_track_entry(struct dom
>       }
>       smp_mb();
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     if (HAS_PERVCPU_VHPT(d)) {
> +             for_each_vcpu(d, v) {
> +                     if (!test_bit(_VCPUF_initialised, &v->vcpu_flags))
> +                             continue;
> +                     if (!vcpu_isset(v->vcpu_id, entry->vcpu_dirty_mask))
> +                             continue;
> +
> +                     /* Invalidate VHPT entries.  */
> +                     vcpu_vhpt_flush_range(v, entry->vaddr, PAGE_SIZE);
> +             }
> +     } else {
> +             for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
> +                     /* Invalidate VHPT entries.  */
> +                     cpu_vhpt_flush_range(cpu, entry->vaddr, PAGE_SIZE);
> +             }
> +     }
> +#else
>       for_each_cpu_mask(cpu, entry->pcpu_dirty_mask) {
>               //printk("%s:%d cpu %d\n", __func__, __LINE__, cpu);
>               /* Invalidate VHPT entries.  */
> -             cpu_flush_vhpt_range(cpu, entry->vaddr, PAGE_SIZE);
> -     }
> +             cpu_vhpt_flush_range(cpu, entry->vaddr, PAGE_SIZE);
> +     }
> +#endif
>       // ptc.ga has release semantics.
>  
>       /* ptc.ga  */
> @@ -272,7 +372,7 @@ static void flush_tlb_vhpt_all (struct d
>  static void flush_tlb_vhpt_all (struct domain *d)
>  {
>       /* First VHPT.  */
> -     vhpt_flush ();
> +     local_vhpt_flush ();
>  
>       /* Then mTLB.  */
>       local_flush_tlb_all ();
> @@ -281,7 +381,14 @@ void domain_flush_destroy (struct domain
>  void domain_flush_destroy (struct domain *d)
>  {
>       /* Very heavy...  */
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +     if (HAS_PERVCPU_VHPT(d))
> +             on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1);
> +     else
> +             on_each_cpu((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
> +#else
>       on_each_cpu ((void (*)(void *))flush_tlb_vhpt_all, d, 1, 1);
> +#endif
>       cpus_clear (d->domain_dirty_cpumask);
>  }
>  
> diff -r cb0aa2b2e180 -r a56d48066373 xen/include/asm-ia64/domain.h
> --- a/xen/include/asm-ia64/domain.h   Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/include/asm-ia64/domain.h   Mon Jul 24 21:37:15 2006 +0900
> @@ -63,6 +63,9 @@ struct arch_domain {
>          unsigned long flags;
>          struct {
>              unsigned int is_vti : 1;
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +            unsigned int has_pervcpu_vhpt : 1;
> +#endif
>          };
>      };
>  
> @@ -108,6 +111,13 @@ struct arch_domain {
>  #define INT_ENABLE_OFFSET(v)                   \
>      (sizeof(vcpu_info_t) * (v)->vcpu_id + \
>      offsetof(vcpu_info_t, evtchn_upcall_mask))
> +
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +#define HAS_PERVCPU_VHPT(d)     ((d)->arch.has_pervcpu_vhpt)
> +#else
> +#define HAS_PERVCPU_VHPT(d)     (0)
> +#endif
> +
>  
>  struct arch_vcpu {
>      /* Save the state of vcpu.
> @@ -158,6 +168,13 @@ struct arch_vcpu {
>      fpswa_ret_t fpswa_ret;   /* save return values of FPSWA emulation */
>      struct arch_vmx_struct arch_vmx; /* Virtual Machine Extensions */
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +    PTA                 pta;
> +    unsigned long       vhpt_maddr;
> +    struct page_info*   vhpt_page;
> +    unsigned long       vhpt_entries;
> +#endif
> +
>  #define INVALID_PROCESSOR       INT_MAX
>      int last_processor;
>  };
> diff -r cb0aa2b2e180 -r a56d48066373 xen/include/asm-ia64/vhpt.h
> --- a/xen/include/asm-ia64/vhpt.h     Mon Jul 24 21:35:16 2006 +0900
> +++ b/xen/include/asm-ia64/vhpt.h     Mon Jul 24 21:37:15 2006 +0900
> @@ -42,11 +42,47 @@ extern void vhpt_multiple_insert(unsigne
>                                unsigned long logps);
>  extern void vhpt_insert (unsigned long vadr, unsigned long pte,
>                        unsigned long logps);
> -void vhpt_flush(void);
> +void local_vhpt_flush(void);
>  
>  /* Currently the VHPT is allocated per CPU.  */
>  DECLARE_PER_CPU (unsigned long, vhpt_paddr);
>  DECLARE_PER_CPU (unsigned long, vhpt_pend);
>  
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +#if !VHPT_ENABLED
> +#error "VHPT_ENABLED must be set for CONFIG_XEN_IA64_PERVCPU_VHPT"
> +#endif
> +#include <xen/sched.h>
> +int pervcpu_vhpt_alloc(struct vcpu *v);
> +void pervcpu_vhpt_free(struct vcpu *v);
> +static inline unsigned long
> +vcpu_vhpt_maddr(struct vcpu* v)
> +{
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +    if (HAS_PERVCPU_VHPT(v->domain)) {
> +        return v->arch.vhpt_maddr;
> +    }
> +#endif
> +
> +#if 0
> +    // referencing v->processor is racy.
> +    return per_cpu(vhpt_paddr, v->processor);
> +#endif
> +    BUG_ON(v != current);
> +    return __get_cpu_var(vhpt_paddr);
> +}
> +
> +static inline unsigned long
> +vcpu_pta(struct vcpu* v)
> +{
> +#ifdef CONFIG_XEN_IA64_PERVCPU_VHPT
> +    if (HAS_PERVCPU_VHPT(v->domain)) {
> +        return v->arch.pta.val;
> +    }
> +#endif
> +    return VHPT_ADDR | (1 << 8) | (VHPT_SIZE_LOG2 << 2) | VHPT_ENABLED;
> +}
> +#endif
> +
>  #endif /* !__ASSEMBLY */
>  #endif
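
(For reference, usage as I intend it: compile the feature in with the new
make option and, if needed, disable it at runtime with the xen boot
parameter.  The exact make invocation below is an assumption on my part;
only the option names come from the patches.)

    # build with per-vcpu VHPT (and tlb tracking, which this series needs)
    make xen_ia64_tlb_track=y xen_ia64_pervcpu_vhpt=y

    # to fall back to the per-pcpu VHPT without rebuilding, append
    #   pervcpu_vhpt=0
    # to the xen boot command line.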

> _______________________________________________
> Xen-ia64-devel mailing list
> Xen-ia64-devel@xxxxxxxxxxxxxxxxxxx
> http://lists.xensource.com/xen-ia64-devel

-- 
yamahata

Attachment: 10701:3cee9325a6c6_import_linux_hash.h.patch
Description: Text document

Attachment: 10702:b90fff753ca1_tlb_track.patch
Description: Text document

Attachment: 10703:f9b91b850f7b_pervcpu_vhpt.patch
Description: Text document

_______________________________________________
Xen-ia64-devel mailing list
Xen-ia64-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-ia64-devel