
[Xen-devel] [PATCH Xen-unstable] Balloon down memory to achieve enough DMA32 memory for PV guests with PCI pass-through to successfully launch.



# HG changeset patch
# User konrad.wilk@xxxxxxxxxx
# Date 1258150318 18000
# Node ID 82762bc10aa5a193173d8a83a5dbada1003bdcd2
# Parent  88adf22e0fe3a77d0be95530b74c3781ffc918f1
Balloon down memory to achieve enough DMA32 memory for PV guests with PCI
pass-through to successfully launch.

If the user hasn't used the dom0_mem= boot parameter, the privileged domain
usurps all of the memory. During launch of a PV guest we ratchet down the
privileged domain's memory to free up what the PV guest requires. However,
for PV guests with PCI pass-through we do not take into account that the
guest is going to swap its SWIOTLB memory for DMA32 memory - 64MB of it,
in fact. This patch balloons down the privileged domain so that at least
64MB of DMA32 memory is available.

Note: If 'dom0_mem' is used, the user will probably never encounter this
failure.
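
For reference, the check that the new _freeDMAmemory() helper below performs
on each retry boils down to the following arithmetic. This is only a sketch;
the constant names and the node_to_dma32_mem value are made up for
illustration, and all quantities are in KiB, matching what the extended
xc.physinfo() returns:

    # Sketch of the DMA32 shortfall check retried by _freeDMAmemory().
    SWIOTLB_KIB = 64 * 1024      # the PV guest wants 64MB of DMA32 memory
    SLACK_KIB = 2048             # plus an extra 2MB of head-room

    node = 0
    node_to_dma32_mem = [20480]  # illustrative: node 0 has only 20MB below 4GB

    need_mem = SWIOTLB_KIB + SLACK_KIB - node_to_dma32_mem[node]
    if need_mem > 0:
        # Dom0 is ballooned down by at least this much (balloon.free() in
        # xend) and the check is repeated until enough DMA32 memory is free.
        print "Need %dKiB more DMA32 memory" % need_mem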

P.S.
If you see:
about to get started...

and nothing after that, and xenctx shows a call trace like this:

Call Trace:
  [<ffffffff8132cfe3>] __const_udelay+0x1e  <--
  [<ffffffff816b9043>] panic+0x1c0
  [<ffffffff81013335>] xen_swiotlb_fixup+0x123
  [<ffffffff81a05e17>] xen_swiotlb_init_with_default_size+0x9c
  [<ffffffff81a05f91>] xen_swiotlb_init+0x4b
  [<ffffffff81a0ab72>] pci_iommu_alloc+0x86
  [<ffffffff81a22972>] mem_init+0x28
  [<ffffffff813201a9>] sort_extable+0x39
  [<ffffffff819feb90>] start_kernel+0x301
  [<ffffffff819fdf76>] x86_64_start_reservations+0x101
  [<ffffffff81a03cdf>] xen_start_kernel+0x715

then this is the patch that addresses it.
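
To see how much DMA32 memory each node has before and after the ballooning,
the new field can be read straight from the libxc Python binding. A rough
sketch, assuming this patch is applied and the script runs as root in dom0
(error handling omitted):

    # Print the per-node free DMA32 memory (in KiB) that this patch exposes
    # through xc.physinfo() as the 'node_to_dma32_mem' list.
    import xen.lowlevel.xc

    xc = xen.lowlevel.xc.xc()
    physinfo = xc.physinfo()
    for node in range(physinfo['nr_nodes']):
        dma32_kib = physinfo['node_to_dma32_mem'][node]
        print "node%d: %dKiB free below 4GB" % (node, dma32_kib)

The same values also show up in 'xm info' next to node_to_memory, since
XendNode.py formats both fields with the same helper.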

diff -r 88adf22e0fe3 -r 82762bc10aa5 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri Nov 13 17:10:09 2009 -0500
+++ b/tools/python/xen/lowlevel/xc/xc.c Fri Nov 13 17:11:58 2009 -0500
@@ -1059,6 +1059,7 @@
     int i, j, max_cpu_id;
     uint64_t free_heap;
     PyObject *ret_obj, *node_to_cpu_obj, *node_to_memory_obj;
+    PyObject *node_to_dma32_mem_obj;
     xc_cpu_to_node_t map[MAX_CPU_ID + 1];
     const char *virtcap_names[] = { "hvm", "hvm_directio" };
 
@@ -1128,10 +1129,27 @@
         Py_DECREF(pyint);
     }
 
+    xc_dom_loginit();
+    /* DMA memory. */
+    node_to_dma32_mem_obj = PyList_New(0);
+
+    for ( i = 0; i < info.nr_nodes; i++ )
+    {
+        PyObject *pyint;
+
+        xc_availheap(self->xc_handle, 0, 32, i, &free_heap);
+        xc_dom_printf("Node:%d: DMA32:%ld\n", i, free_heap);
+        pyint = PyInt_FromLong(free_heap / 1024);
+        PyList_Append(node_to_dma32_mem_obj, pyint);
+        Py_DECREF(pyint);
+    }
+
     PyDict_SetItemString(ret_obj, "node_to_cpu", node_to_cpu_obj);
     Py_DECREF(node_to_cpu_obj);
     PyDict_SetItemString(ret_obj, "node_to_memory", node_to_memory_obj);
     Py_DECREF(node_to_memory_obj);
+    PyDict_SetItemString(ret_obj, "node_to_dma32_mem", node_to_dma32_mem_obj);
+    Py_DECREF(node_to_dma32_mem_obj);
  
     return ret_obj;
 #undef MAX_CPU_ID
diff -r 88adf22e0fe3 -r 82762bc10aa5 tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Fri Nov 13 17:10:09 2009 -0500
+++ b/tools/python/xen/xend/XendConfig.py       Fri Nov 13 17:11:58 2009 -0500
@@ -2111,6 +2111,13 @@
     def is_hap(self):
         return self['platform'].get('hap', 0)
 
+    def is_pv_and_has_pci(self):
+        for dev_type, dev_info in self.all_devices_sxpr():
+            if dev_type != 'pci':
+                continue
+            return not self.is_hvm()
+        return False
+
     def update_platform_pci(self):
         pci = []
         for dev_type, dev_info in self.all_devices_sxpr():
diff -r 88adf22e0fe3 -r 82762bc10aa5 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Fri Nov 13 17:10:09 2009 -0500
+++ b/tools/python/xen/xend/XendDomainInfo.py   Fri Nov 13 17:11:58 2009 -0500
@@ -2580,7 +2580,8 @@
 
 
     def _setCPUAffinity(self):
-        """ Repin domain vcpus if a restricted cpus list is provided
+        """ Repin domain vcpus if a restricted cpus list is provided.
+            Returns the chosen node number.
         """
 
         def has_cpus():
@@ -2597,6 +2598,7 @@
                         return True
             return False
 
+        index = 0
         if has_cpumap():
             for v in range(0, self.info['VCPUs_max']):
                 if self.info['vcpus_params'].has_key('cpumap%i' % v):
@@ -2647,6 +2649,54 @@
                 cpumask = info['node_to_cpu'][index]
                 for v in range(0, self.info['VCPUs_max']):
                     xc.vcpu_setaffinity(self.domid, v, cpumask)
+        return index
+
+    def _freeDMAmemory(self, node):
+
+        # If we are PV and have PCI devices, the guest will
+        # turn on a SWIOTLB. The SWIOTLB _MUST_ be located in the DMA32
+        # zone (under 4GB). To do so, we need to balloon down Dom0 until
+        # there is enough (64MB) memory under the 4GB mark. This ballooning
+        # might take out more memory than just 64MB though :-(
+        if not self.info.is_pv_and_has_pci():
+            return
+
+        retries = 2000
+        ask_for_mem = 0
+        need_mem = 0
+        try:
+            while retries > 0:
+                physinfo = xc.physinfo()
+                free_mem = physinfo['free_memory']
+                nr_nodes = physinfo['nr_nodes']
+                node_to_dma32_mem = physinfo['node_to_dma32_mem']
+                if node >= nr_nodes:
+                    return
+                # An extra 2MB on top of the 64MB seems to do the trick.
+                need_mem = 64 * 1024 + 2048 - node_to_dma32_mem[node]
+                # Our starting point. We ask just for the difference needed
+                # to have an extra 64MB under 4GB.
+                ask_for_mem = max(need_mem, ask_for_mem)
+                if need_mem > 0:
+                    log.debug('_freeDMAmemory (%d) Need %dKiB DMA memory. '
+                              'Asking for %dKiB', retries, need_mem,
+                              ask_for_mem)
+
+                    balloon.free(ask_for_mem, self)
+                    ask_for_mem = ask_for_mem + 2048
+                else:
+                    # OK. We got enough DMA memory.
+                    break
+                retries = retries - 1
+        except:
+            # This is best-effort, after all.
+            need_mem = max(1, need_mem)
+            pass
+
+        if need_mem > 0:
+            log.warn('We tried our best to balloon down DMA memory to '
+                     'accommodate your PV guest. We need %dKiB extra memory.',
+                     need_mem)
 
     def _setSchedParams(self):
         if XendNode.instance().xenschedinfo() == 'credit':
@@ -2668,7 +2718,7 @@
             # repin domain vcpus if a restricted cpus list is provided
             # this is done prior to memory allocation to aide in memory
             # distribution for NUMA systems.
-            self._setCPUAffinity()
+            node = self._setCPUAffinity()
 
             # Set scheduling parameters.
             self._setSchedParams()
@@ -2730,6 +2780,8 @@
             if self.info.target():
                 self._setTarget(self.info.target())
 
+            self._freeDMAmemory(node)
+
             self._createDevices()
 
             self.image.cleanupTmpImages()
diff -r 88adf22e0fe3 -r 82762bc10aa5 tools/python/xen/xend/XendNode.py
--- a/tools/python/xen/xend/XendNode.py Fri Nov 13 17:10:09 2009 -0500
+++ b/tools/python/xen/xend/XendNode.py Fri Nov 13 17:11:58 2009 -0500
@@ -872,11 +872,11 @@
         except:
             str='none\n'
         return str[:-1];
-    def format_node_to_memory(self, pinfo):
+    def format_node_to_memory(self, pinfo, key):
         str=''
         whitespace=''
         try:
-            node_to_memory=pinfo['node_to_memory']
+            node_to_memory=pinfo[key]
             for i in range(0, pinfo['nr_nodes']):
                 str+='%snode%d:%d\n' % (whitespace,
                                         i,
@@ -896,7 +896,10 @@
         info['total_memory'] = info['total_memory'] / 1024
         info['free_memory']  = info['free_memory'] / 1024
         info['node_to_cpu']  = self.format_node_to_cpu(info)
-        info['node_to_memory'] = self.format_node_to_memory(info)
+        info['node_to_memory'] = self.format_node_to_memory(info,
+                                       'node_to_memory')
+        info['node_to_dma32_mem'] = self.format_node_to_memory(info,
+                                       'node_to_dma32_mem')
 
         ITEM_ORDER = ['nr_cpus',
                       'nr_nodes',
@@ -908,7 +911,8 @@
                       'total_memory',
                       'free_memory',
                       'node_to_cpu',
-                      'node_to_memory'
+                      'node_to_memory',
+                      'node_to_dma32_mem'
                       ]
 
         return [[k, info[k]] for k in ITEM_ORDER]
