Re: [Xen-devel] 4.11.0 RC1 panic
>>> On 10.06.18 at 12:57, <bouyer@xxxxxxxxxxxxxxx> wrote:
> (XEN) Xen call trace:
> (XEN)    [<ffff82d080284bd2>] mm.c#dec_linear_entries+0x12/0x20
> (XEN)    [<ffff82d08028922e>] mm.c#_put_page_type+0x13e/0x350
> (XEN)    [<ffff82d08023a00d>] _spin_lock+0xd/0x50
> (XEN)    [<ffff82d0802898af>] mm.c#put_page_from_l2e+0xdf/0x110
> (XEN)    [<ffff82d080288c59>] free_page_type+0x2f9/0x790
> (XEN)    [<ffff82d0802891f7>] mm.c#_put_page_type+0x107/0x350
> (XEN)    [<ffff82d0802898ef>] put_page_type_preemptible+0xf/0x10
> (XEN)    [<ffff82d080272adb>] domain.c#relinquish_memory+0xab/0x460
> (XEN)    [<ffff82d080276ae3>] domain_relinquish_resources+0x203/0x290
> (XEN)    [<ffff82d0802068bd>] domain_kill+0xbd/0x150
> (XEN)    [<ffff82d0802039e3>] do_domctl+0x7d3/0x1a90
> (XEN)    [<ffff82d080203210>] do_domctl+0/0x1a90
> (XEN)    [<ffff82d080367b95>] pv_hypercall+0x1f5/0x430
> (XEN)    [<ffff82d08036e422>] lstar_enter+0xa2/0x120
> (XEN)    [<ffff82d08036e42e>] lstar_enter+0xae/0x120
> (XEN)    [<ffff82d08036e422>] lstar_enter+0xa2/0x120
> (XEN)    [<ffff82d08036e42e>] lstar_enter+0xae/0x120
> (XEN)    [<ffff82d08036e422>] lstar_enter+0xa2/0x120
> (XEN)    [<ffff82d08036e42e>] lstar_enter+0xae/0x120
> (XEN)    [<ffff82d08036e48c>] lstar_enter+0x10c/0x120

Let's focus on this scenario for now, as it is under better (timing)
control on the Xen side. Below is a first debugging patch which

- avoids the ASSERT() in question, instead triggering a printk(), in the
  hope that the data logged and/or other ASSERT()s shed some additional
  light on the situation,

- logs cleanup activity (this is likely to be quite chatty, so be sure
  you set up large enough internal buffers).

Ideally, if no other ASSERT() triggers as a result of the bypassed one,
you'd try to catch more than a single instance of the problem, so we can
see a possible pattern (if there is one).

A simplistic first XTF test I've created based on your description of
the L2 handling model in NetBSD did not trigger the interesting
printk(), but at least that way I've been able to see that the domain
cleanup logging produces useful data. At the very least I hope that with
this we can derive whether the root of the problem is at page table
teardown / cleanup time, or with management of live ones.

Jan

--- unstable.orig/xen/arch/x86/domain.c
+++ unstable/xen/arch/x86/domain.c
@@ -1872,6 +1872,7 @@ static int relinquish_memory(
 
     while ( (page = page_list_remove_head(list)) )
     {
+bool log = false;//temp
         /* Grab a reference to the page so it won't disappear from under us. */
         if ( unlikely(!get_page(page, d)) )
         {
@@ -1880,6 +1881,10 @@ static int relinquish_memory(
             continue;
         }
 
+if(is_pv_32bit_domain(d) && PGT_type_equal(page->u.inuse.type_info, PGT_l2_page_table)) {//temp
+ printk("d%d:%"PRI_mfn": %lx:%d\n", d->domain_id, mfn_x(page_to_mfn(page)), page->u.inuse.type_info, page->linear_pt_count);
+ log = true;
+}
         if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
             ret = put_page_and_type_preemptible(page);
         switch ( ret )
@@ -1921,7 +1926,13 @@ static int relinquish_memory(
                 if ( likely(y == x) )
                 {
                     /* No need for atomic update of type_info here: noone else updates it. */
-                    switch ( ret = free_page_type(page, x, 1) )
+//temp switch ( ret = free_page_type(page, x, 1) )
+ret = free_page_type(page, x, 1);//temp
+if(log) {//temp
+ printk("%"PRI_mfn" -> %lx:%d (%d,%d,%d)\n", mfn_x(page_to_mfn(page)), page->u.inuse.type_info,
+        page->linear_pt_count, ret, page->nr_validated_ptes, page->partial_pte);
+}
+switch(ret)//temp
                     {
                     case 0:
                         break;
--- unstable.orig/xen/arch/x86/mm.c
+++ unstable/xen/arch/x86/mm.c
@@ -705,12 +705,19 @@ static bool inc_linear_entries(struct pa
     return true;
 }
 
-static void dec_linear_entries(struct page_info *pg)
+//temp static void dec_linear_entries(struct page_info *pg)
+static const struct domain*dec_linear_entries(struct page_info*pg)//temp
 {
     typeof(pg->linear_pt_count) oc;
 
     oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
+{//temp
+ const struct domain*owner = page_get_owner(pg);
+ if(oc <= 0 && is_pv_32bit_domain(owner))
+  return owner;
+}
     ASSERT(oc > 0);
+return NULL;//temp
 }
 
 static bool inc_linear_uses(struct page_info *pg)
@@ -2617,8 +2624,15 @@ static int _put_final_page_type(struct p
     {
         if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
         {
+const struct domain*d;//temp
             dec_linear_uses(page);
+if((d = ({//temp
             dec_linear_entries(ptpg);
+})) != NULL) {//temp
+ printk("d%d: %"PRI_mfn":%lx:%d -> %"PRI_mfn":%lx:%d\n", d->domain_id,
+        mfn_x(page_to_mfn(ptpg)), ptpg->u.inuse.type_info, ptpg->linear_pt_count,
+        mfn_x(page_to_mfn(page)), page->u.inuse.type_info, page->linear_pt_count);
+}
         }
         ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
         set_tlbflush_timestamp(page);
@@ -2704,8 +2718,15 @@ static int _put_page_type(struct page_in
 
         if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
         {
+const struct domain*d;//temp
             dec_linear_uses(page);
+if((d = ({//temp
             dec_linear_entries(ptpg);
+})) != NULL) {//temp
+ printk("d%d: %"PRI_mfn":%lx:%d => %"PRI_mfn":%lx:%d\n", d->domain_id,
+        mfn_x(page_to_mfn(ptpg)), ptpg->u.inuse.type_info, ptpg->linear_pt_count,
+        mfn_x(page_to_mfn(page)), page->u.inuse.type_info, page->linear_pt_count);
+}
         }
 
         return 0;
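
[Editorial illustration, not part of the original mail.] The ASSERT() being
bypassed guards a simple pairing invariant: dec_linear_entries() does a
fetch-and-decrement of pg->linear_pt_count and expects the old value to be
strictly positive, i.e. every decrement must be matched by an earlier
increment taken when the corresponding linear page-table reference was set
up. A minimal, self-contained sketch of that invariant follows; the struct
and function names are illustrative stand-ins, not Xen code.

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-in for Xen's struct page_info; not the real layout. */
struct fake_page {
    int linear_pt_count;   /* simplified count of linear (same-level) references */
};

/* Taking a linear page-table reference bumps the count. */
static void fake_inc_linear_entries(struct fake_page *pg)
{
    pg->linear_pt_count++;
}

/* Dropping such a reference must observe a strictly positive old count. */
static void fake_dec_linear_entries(struct fake_page *pg)
{
    int oc = pg->linear_pt_count--;     /* fetch-and-add(-1), as in mm.c */
    assert(oc > 0);                     /* the ASSERT() seen in the call trace */
}

int main(void)
{
    struct fake_page l2 = { 0 };

    fake_inc_linear_entries(&l2);   /* e.g. an L2 entry referencing another L2 table */
    fake_dec_linear_entries(&l2);   /* balanced pair: fine */
    fake_dec_linear_entries(&l2);   /* unpaired decrement: oc == 0, the assertion fires */
    printf("not reached\n");
    return 0;
}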
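
One detail of the mm.c hunks that is easy to misread: the pre-existing
dec_linear_entries(ptpg); call stays as an unmodified context line, and the
added lines wrap it in a GNU C statement expression so that the function's
newly added return value can be captured without touching the original line.
Below is a minimal sketch of that construct (a GCC/Clang extension; the names
are illustrative only).

#include <stdio.h>

/* Stand-in for the patched dec_linear_entries(): returns non-NULL in the
 * suspicious case, NULL otherwise.  Purely illustrative. */
static const char *fake_dec(int old_count)
{
    return old_count <= 0 ? "owner" : NULL;
}

int main(void)
{
    const char *d;

    /* The value of a ({ ... }) block is the value of its last statement,
     * so the assignment below captures fake_dec()'s result even though the
     * call itself sits on its own, unmodified line. */
    if ( (d = ({
        fake_dec(0);
    })) != NULL )
        printf("would log a diagnostic for %s here\n", d);

    return 0;
}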