
[Xen-devel] [PATCH Xen-unstable] Balloon down memory to achieve enough DMA32 memory for PV guests with PCI pass-through to successfully launch.



# HG changeset patch
# User konrad.wilk@xxxxxxxxxx
# Date 1258150318 18000
# Node ID 82762bc10aa5a193173d8a83a5dbada1003bdcd2
# Parent  88adf22e0fe3a77d0be95530b74c3781ffc918f1
Balloon down memory to achieve enough DMA32 memory for PV guests with PCI
pass-through to successfully launch.

If the user hasn't used the dom0_mem= boot parameter, the privileged domain
usurps all of the memory. During launch of a PV guest we ratchet down the
privileged domain's memory to free up what the PV guest requires. However,
for PV guests with PCI pass-through we do not take into account that the
guest is going to swap its SWIOTLB memory for DMA32 memory - 64MB of it,
in fact. This patch balloons down the privileged domain so that at least
64MB of DMA32 memory is available.

Note: If 'dom0_mem' is used, the user will probably never encounter this
failure.
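
For reference, the check that the new _freeDMAmemory() helper below performs
on each retry boils down to the following arithmetic. This is only a sketch;
the constant names and the node_to_dma32_mem value are made up for
illustration, and all quantities are in KiB, matching what the extended
xc.physinfo() returns:

    # Sketch of the DMA32 shortfall check retried by _freeDMAmemory().
    SWIOTLB_KIB = 64 * 1024      # the PV guest wants 64MB of DMA32 memory
    SLACK_KIB = 2048             # plus an extra 2MB of head-room

    node = 0
    node_to_dma32_mem = [20480]  # illustrative: node 0 has only 20MB below 4GB

    need_mem = SWIOTLB_KIB + SLACK_KIB - node_to_dma32_mem[node]
    if need_mem > 0:
        # Dom0 is ballooned down by at least this much (balloon.free() in
        # xend) and the check is repeated until enough DMA32 memory is free.
        print "Need %dKiB more DMA32 memory" % need_mem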

P.S.
If you see:
about to get started...

and nothing after that, and xenctx shows a call trace like this:

Call Trace:
  [<ffffffff8132cfe3>] __const_udelay+0x1e  <--
  [<ffffffff816b9043>] panic+0x1c0
  [<ffffffff81013335>] xen_swiotlb_fixup+0x123
  [<ffffffff81a05e17>] xen_swiotlb_init_with_default_size+0x9c
  [<ffffffff81a05f91>] xen_swiotlb_init+0x4b
  [<ffffffff81a0ab72>] pci_iommu_alloc+0x86
  [<ffffffff81a22972>] mem_init+0x28
  [<ffffffff813201a9>] sort_extable+0x39
  [<ffffffff819feb90>] start_kernel+0x301
  [<ffffffff819fdf76>] x86_64_start_reservations+0x101
  [<ffffffff81a03cdf>] xen_start_kernel+0x715

then this is the patch that addresses it.
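
To see how much DMA32 memory each node has before and after the ballooning,
the new field can be read straight from the libxc Python binding. A rough
sketch, assuming this patch is applied and the script runs as root in dom0
(error handling omitted):

    # Print the per-node free DMA32 memory (in KiB) that this patch exposes
    # through xc.physinfo() as the 'node_to_dma32_mem' list.
    import xen.lowlevel.xc

    xc = xen.lowlevel.xc.xc()
    physinfo = xc.physinfo()
    for node in range(physinfo['nr_nodes']):
        dma32_kib = physinfo['node_to_dma32_mem'][node]
        print "node%d: %dKiB free below 4GB" % (node, dma32_kib)

The same values also show up in 'xm info' next to node_to_memory, since
XendNode.py formats both fields with the same helper.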

diff -r 88adf22e0fe3 -r 82762bc10aa5 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Fri Nov 13 17:10:09 2009 -0500
+++ b/tools/python/xen/lowlevel/xc/xc.c Fri Nov 13 17:11:58 2009 -0500
@@ -1059,6 +1059,7 @@
     int i, j, max_cpu_id;
     uint64_t free_heap;
     PyObject *ret_obj, *node_to_cpu_obj, *node_to_memory_obj;
+    PyObject *node_to_dma32_mem_obj;
     xc_cpu_to_node_t map[MAX_CPU_ID + 1];
     const char *virtcap_names[] = { "hvm", "hvm_directio" };
 
@@ -1128,10 +1129,27 @@
         Py_DECREF(pyint);
     }
 
+    xc_dom_loginit();
+    /* DMA memory. */
+    node_to_dma32_mem_obj = PyList_New(0);
+
+    for ( i = 0; i < info.nr_nodes; i++ )
+    {
+        PyObject *pyint;
+
+        xc_availheap(self->xc_handle, 0, 32, i, &free_heap);
+        xc_dom_printf("Node:%d: DMA32:%ld\n", i, free_heap);
+        pyint = PyInt_FromLong(free_heap / 1024);
+        PyList_Append(node_to_dma32_mem_obj, pyint);
+        Py_DECREF(pyint);
+    }
+
     PyDict_SetItemString(ret_obj, "node_to_cpu", node_to_cpu_obj);
     Py_DECREF(node_to_cpu_obj);
     PyDict_SetItemString(ret_obj, "node_to_memory", node_to_memory_obj);
     Py_DECREF(node_to_memory_obj);
+    PyDict_SetItemString(ret_obj, "node_to_dma32_mem", node_to_dma32_mem_obj);
+    Py_DECREF(node_to_dma32_mem_obj);
  
     return ret_obj;
 #undef MAX_CPU_ID
diff -r 88adf22e0fe3 -r 82762bc10aa5 tools/python/xen/xend/XendConfig.py
--- a/tools/python/xen/xend/XendConfig.py       Fri Nov 13 17:10:09 2009 -0500
+++ b/tools/python/xen/xend/XendConfig.py       Fri Nov 13 17:11:58 2009 -0500
@@ -2111,6 +2111,13 @@
     def is_hap(self):
         return self['platform'].get('hap', 0)
 
+    def is_pv_and_has_pci(self):
+        for dev_type, dev_info in self.all_devices_sxpr():
+            if dev_type != 'pci':
+                continue
+            return not self.is_hvm()
+        return False
+
     def update_platform_pci(self):
         pci = []
         for dev_type, dev_info in self.all_devices_sxpr():
diff -r 88adf22e0fe3 -r 82762bc10aa5 tools/python/xen/xend/XendDomainInfo.py
--- a/tools/python/xen/xend/XendDomainInfo.py   Fri Nov 13 17:10:09 2009 -0500
+++ b/tools/python/xen/xend/XendDomainInfo.py   Fri Nov 13 17:11:58 2009 -0500
@@ -2580,7 +2580,8 @@
 
 
     def _setCPUAffinity(self):
-        """ Repin domain vcpus if a restricted cpus list is provided
+        """ Repin domain vcpus if a restricted cpus list is provided.
+            Returns the chosen node number.
         """
 
         def has_cpus():
@@ -2597,6 +2598,7 @@
                         return True
             return False
 
+        index = 0
         if has_cpumap():
             for v in range(0, self.info['VCPUs_max']):
                 if self.info['vcpus_params'].has_key('cpumap%i' % v):
@@ -2647,6 +2649,54 @@
                 cpumask = info['node_to_cpu'][index]
                 for v in range(0, self.info['VCPUs_max']):
                     xc.vcpu_setaffinity(self.domid, v, cpumask)
+        return index
+
+    def _freeDMAmemory(self, node):
+
+        # If we are PV and have PCI devices, the guest will
+        # turn on a SWIOTLB. The SWIOTLB _MUST_ be located in the DMA32
+        # zone (under 4GB). To do so, we need to balloon down Dom0 until
+        # there is enough (64MB) memory under the 4GB mark. This ballooning
+        # might take out more memory than just 64MB though :-(
+        if not self.info.is_pv_and_has_pci():
+            return
+
+        retries = 2000
+        ask_for_mem = 0
+        need_mem = 0
+        try:
+            while retries > 0:
+                physinfo = xc.physinfo()
+                free_mem = physinfo['free_memory']
+                nr_nodes = physinfo['nr_nodes']
+                node_to_dma32_mem = physinfo['node_to_dma32_mem']
+                if node >= nr_nodes:
+                    return
+                # An extra 2MB on top of the 64MB seems to do the trick.
+                need_mem = 64 * 1024 + 2048 - node_to_dma32_mem[node]
+                # Our starting point. We ask just for the difference needed
+                # to have an extra 64MB under 4GB.
+                ask_for_mem = max(need_mem, ask_for_mem)
+                if need_mem > 0:
+                    log.debug('_freeDMAmemory (%d) Need %dKiB DMA memory. '
+                              'Asking for %dKiB', retries, need_mem,
+                              ask_for_mem)
+
+                    balloon.free(ask_for_mem, self)
+                    ask_for_mem = ask_for_mem + 2048
+                else:
+                    # OK. We got enough DMA memory.
+                    break
+                retries = retries - 1
+        except:
+            # This is best-effort, after all.
+            need_mem = max(1, need_mem)
+            pass
+
+        if need_mem > 0:
+            log.warn('We tried our best to balloon down DMA memory to '
+                     'accommodate your PV guest. We need %dKiB extra memory.',
+                     need_mem)
 
     def _setSchedParams(self):
         if XendNode.instance().xenschedinfo() == 'credit':
@@ -2668,7 +2718,7 @@
             # repin domain vcpus if a restricted cpus list is provided
             # this is done prior to memory allocation to aide in memory
             # distribution for NUMA systems.
-            self._setCPUAffinity()
+            node = self._setCPUAffinity()
 
             # Set scheduling parameters.
             self._setSchedParams()
@@ -2730,6 +2780,8 @@
             if self.info.target():
                 self._setTarget(self.info.target())
 
+            self._freeDMAmemory(node)
+
             self._createDevices()
 
             self.image.cleanupTmpImages()
diff -r 88adf22e0fe3 -r 82762bc10aa5 tools/python/xen/xend/XendNode.py
--- a/tools/python/xen/xend/XendNode.py Fri Nov 13 17:10:09 2009 -0500
+++ b/tools/python/xen/xend/XendNode.py Fri Nov 13 17:11:58 2009 -0500
@@ -872,11 +872,11 @@
         except:
             str='none\n'
         return str[:-1];
-    def format_node_to_memory(self, pinfo):
+    def format_node_to_memory(self, pinfo, key):
         str=''
         whitespace=''
         try:
-            node_to_memory=pinfo['node_to_memory']
+            node_to_memory=pinfo[key]
             for i in range(0, pinfo['nr_nodes']):
                 str+='%snode%d:%d\n' % (whitespace,
                                         i,
@@ -896,7 +896,10 @@
         info['total_memory'] = info['total_memory'] / 1024
         info['free_memory']  = info['free_memory'] / 1024
         info['node_to_cpu']  = self.format_node_to_cpu(info)
-        info['node_to_memory'] = self.format_node_to_memory(info)
+        info['node_to_memory'] = self.format_node_to_memory(info,
+                                       'node_to_memory')
+        info['node_to_dma32_mem'] = self.format_node_to_memory(info,
+                                       'node_to_dma32_mem')
 
         ITEM_ORDER = ['nr_cpus',
                       'nr_nodes',
@@ -908,7 +911,8 @@
                       'total_memory',
                       'free_memory',
                       'node_to_cpu',
-                      'node_to_memory'
+                      'node_to_memory',
+                      'node_to_dma32_mem'
                       ]
 
         return [[k, info[k]] for k in ITEM_ORDER]
