
Re: [Xen-devel] [PATCH] MCE: Fix race condition in mctelem_reserve



Yes,
  all these patches came from customer reports. We managed to reproduce
the problem with xen-mceinj and hit these kinds of races. These patches
(the other one is already applied) fix the race issues we found.

I tested with a CentOS 6.4 dom0 running mcelog.

Frediano

On Tue, 2014-02-18 at 08:42 +0000, Liu, Jinsong wrote:
> This logic (mctelem) is related to dom0 mcelog logic. Have you tested if 
> mcelog works fine with your patch?
> 
> Thanks,
> Jinsong
> 
> Frediano Ziglio wrote:
> > These lines (in mctelem_reserve)
> > 
> > 
> >         newhead = oldhead->mcte_next;
> >         if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
> > 
> > are racy. After you read the newhead pointer, another flow (thread or
> > recursive invocation) can rearrange the whole list yet end up with the
> > same head value. oldhead then still equals *freelp, so the cmpxchg
> > succeeds, but the newhead it installs may point to an arbitrary
> > element (possibly one already in use): the classic ABA problem.
> > 
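> > To make the window concrete, here is a minimal standalone sketch of
> > that pop pattern (illustrative only, not the Xen code: struct ent and
> > racy_pop are made up, and GCC's __sync_bool_compare_and_swap stands in
> > for cmpxchgptr):
> > 
> >         #include <stddef.h>
> > 
> >         struct ent {
> >                 struct ent *next;
> >         };
> > 
> >         static struct ent *racy_pop(struct ent **freelp)
> >         {
> >                 struct ent *oldhead, *newhead;
> > 
> >                 for (;;) {
> >                         if ((oldhead = *freelp) == NULL)  /* (1) read head */
> >                                 return NULL;
> >                         newhead = oldhead->next;          /* (2) read successor */
> >                         /*
> >                          * Between (2) and (3) another flow can pop oldhead,
> >                          * pop or reuse newhead, then push oldhead back.  The
> >                          * head equals oldhead again, so (3) succeeds, yet
> >                          * the stale newhead it installs may already be in use.
> >                          */
> >                         if (__sync_bool_compare_and_swap(freelp, oldhead, newhead))
> >                                 return oldhead;           /* (3) swing head */
> >                 }
> >         }
> > 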
> > This patch instead uses a bit array and atomic bit operations.
> > 
> > It actually uses an unsigned long rather than the bitmap type, as
> > testing for all zeroes is easier.
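> > 
> > A minimal standalone sketch of that idea (again illustrative only:
> > reserve_slot is made up, and __builtin_ctzl plus __sync_fetch_and_and
> > stand in for Xen's find_first_set_bit and test_and_clear_bit):
> > 
> >         /* bit i set in the mask means element i is free */
> >         static int reserve_slot(unsigned long *maskp)
> >         {
> >                 unsigned long old, bitmask;
> >                 unsigned int bit;
> > 
> >                 for (;;) {
> >                         if ((old = *maskp) == 0)
> >                                 return -1;             /* nothing free */
> >                         bit = __builtin_ctzl(old);     /* lowest set bit */
> >                         bitmask = 1UL << bit;
> >                         /* atomically claim the bit; at most one flow wins */
> >                         if (__sync_fetch_and_and(maskp, ~bitmask) & bitmask)
> >                                 return bit;
> >                         /* lost the race for that bit: retry with a fresh mask */
> >                 }
> >         }
> > 
> > No matter how the mask changes between the read and the claim, the
> > atomic test-and-clear hands each bit to at most one flow, so a stale
> > pointer can never be installed.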
> > 
> > Signed-off-by: Frediano Ziglio <frediano.ziglio@xxxxxxxxxx>
> > ---
> >  xen/arch/x86/cpu/mcheck/mctelem.c |   52 ++++++++++++++++++++++---------------
> >  1 file changed, 31 insertions(+), 21 deletions(-)
> > 
> > diff --git a/xen/arch/x86/cpu/mcheck/mctelem.c b/xen/arch/x86/cpu/mcheck/mctelem.c
> > index 895ce1a..e56b6fb 100644
> > --- a/xen/arch/x86/cpu/mcheck/mctelem.c
> > +++ b/xen/arch/x86/cpu/mcheck/mctelem.c
> > @@ -69,6 +69,11 @@ struct mctelem_ent {
> >  #define    MC_URGENT_NENT          10
> >  #define    MC_NONURGENT_NENT       20
> > 
> > +/* Check if we can fit enough bits in the free bit array */
> > +#if MC_URGENT_NENT + MC_NONURGENT_NENT > BITS_PER_LONG
> > +#error Too many elements
> > +#endif
> > +
> >  #define    MC_NCLASSES             (MC_NONURGENT + 1)
> > 
> >  #define    COOKIE2MCTE(c)          ((struct mctelem_ent *)(c))
> > @@ -77,11 +82,9 @@ struct mctelem_ent {
> >  static struct mc_telem_ctl {
> >     /* Linked lists that thread the array members together.
> >      *
> > -    * The free lists are singly-linked via mcte_next, and we allocate
> > -    * from them by atomically unlinking an element from the head.
> > -    * Consumed entries are returned to the head of the free list.
> > -    * When an entry is reserved off the free list it is not linked
> > -    * on any list until it is committed or dismissed.
> > +    * The free list is a bit array where a set bit means free.
> > +    * As the number of elements is quite small it is easy to
> > +    * allocate them atomically that way.
> >      *
> >      * The committed list grows at the head and we do not maintain a
> >      * tail pointer; insertions are performed atomically.  The head
> > @@ -101,7 +104,7 @@ static struct mc_telem_ctl {
> >      * we can lock it for updates.  The head of the processing list
> >      * always has the oldest telemetry, and we append (as above)
> >      * at the tail of the processing list. */
> > -   struct mctelem_ent *mctc_free[MC_NCLASSES];
> > +   unsigned long mctc_free[MC_NCLASSES];
> >     struct mctelem_ent *mctc_committed[MC_NCLASSES];
> >     struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
> >     struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
> > @@ -214,7 +217,10 @@ static void mctelem_free(struct mctelem_ent *tep)
> >     BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);
> > 
> >     tep->mcte_prev = NULL;
> > -   mctelem_xchg_head(&mctctl.mctc_free[target], &tep->mcte_next, tep);
> > +   tep->mcte_next = NULL;
> > +
> > +   /* set free in array */
> > +   set_bit(tep - mctctl.mctc_elems, &mctctl.mctc_free[target]);
> >  }
> > 
> >  /* Increment the reference count of an entry that is not linked on to
> > @@ -284,7 +290,7 @@ void mctelem_init(int reqdatasz)
> >     }
> > 
> >     for (i = 0; i < MC_URGENT_NENT + MC_NONURGENT_NENT; i++) {
> > -           struct mctelem_ent *tep, **tepp;
> > +           struct mctelem_ent *tep;
> > 
> >             tep = mctctl.mctc_elems + i;
> >             tep->mcte_flags = MCTE_F_STATE_FREE;
> > @@ -292,16 +298,15 @@ void mctelem_init(int reqdatasz)
> >             tep->mcte_data = datarr + i * datasz;
> > 
> >             if (i < MC_URGENT_NENT) {
> > -                   tepp = &mctctl.mctc_free[MC_URGENT];
> > -                   tep->mcte_flags |= MCTE_F_HOME_URGENT;
> > +                   __set_bit(i, &mctctl.mctc_free[MC_URGENT]);
> > +                   tep->mcte_flags = MCTE_F_HOME_URGENT;
> >             } else {
> > -                   tepp = &mctctl.mctc_free[MC_NONURGENT];
> > -                   tep->mcte_flags |= MCTE_F_HOME_NONURGENT;
> > +                   __set_bit(i, &mctctl.mctc_free[MC_NONURGENT]);
> > +                   tep->mcte_flags = MCTE_F_HOME_NONURGENT;
> >             }
> > 
> > -           tep->mcte_next = *tepp;
> > +           tep->mcte_next = NULL;
> >             tep->mcte_prev = NULL;
> > -           *tepp = tep;
> >     }
> >  }
> > 
> > @@ -310,18 +315,21 @@ static int mctelem_drop_count;
> > 
> >  /* Reserve a telemetry entry, or return NULL if none available.
> >   * If we return an entry then the caller must subsequently call exactly one of
> > - * mctelem_unreserve or mctelem_commit for that entry.
> > + * mctelem_dismiss or mctelem_commit for that entry.
> >   */
> >  mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
> >  {
> > -   struct mctelem_ent **freelp;
> > -   struct mctelem_ent *oldhead, *newhead;
> > +   unsigned long *freelp;
> > +   unsigned long oldfree;
> > +   unsigned bit;
> >     mctelem_class_t target = (which == MC_URGENT) ?
> >         MC_URGENT : MC_NONURGENT;
> > 
> >     freelp = &mctctl.mctc_free[target];
> >     for (;;) {
> > -           if ((oldhead = *freelp) == NULL) {
> > +           oldfree = *freelp;
> > +
> > +           if (oldfree == 0) {
> >                     if (which == MC_URGENT && target == MC_URGENT) {
> >                             /* raid the non-urgent freelist */
> >                             target = MC_NONURGENT;
> > @@ -333,9 +341,11 @@ mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
> >                     }
> >             }
> > 
> > -           newhead = oldhead->mcte_next;
> > -           if (cmpxchgptr(freelp, oldhead, newhead) == oldhead) {
> > -                   struct mctelem_ent *tep = oldhead;
> > +           /* try to allocate, atomically clear free bit */
> > +           bit = find_first_set_bit(oldfree);
> > +           if (test_and_clear_bit(bit, freelp)) {
> > +                   /* return element we got */
> > +                   struct mctelem_ent *tep = mctctl.mctc_elems + bit;
> > 
> >                     mctelem_hold(tep);
> >                     MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
> 



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxx
http://lists.xen.org/xen-devel


 

