[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [PATCH] 2/2: cpufreq/PowerNow! in Xen: PowerNow! changes



Enable cpufreq support in Xen for AMD Opteron processors by:

1) Allowing the PowerNow! driver in dom0 to write to the PowerNow!
MSRs.
2) Adding the cpufreq notifier chain to time-xen.c in dom0.
On a frequency change, a platform hypercall is performed to
scale the frequency multiplier in the hypervisor.
3) Adding a platform hypercall to the hypervisor to scale
the frequency multiplier and reset the time stamps so that
the next calibration remains reasonably correct.

Patch 1 covers the frequency scaling platform call.
Patch 2 covers the changes necessary to the PowerNow! driver
to make it correctly associate shared cores under Xen and to
write to MSRs.  Most of this patch modifies the PowerNow!
driver to correctly use the _PSD structure to determine
pstate domains; a similar patch is upstream for the Linux
kernel.

This code can be readily expanded to cover Intel or other
non-AMD processors by modifying xen/arch/x86/traps.c to
allow the appropriate MSR accesses.

Caveat: currently, this code does not support the in-kernel
ondemand cpufreq governor.  Dom0 must run a userspace 
daemon to monitor the utilization of the physical cpus
with the getcpuinfo sysctl hypercall.

Caveat 2: on SMP systems, dom0_vcpus_pin is strongly
advised.

Caveat 3: Even though the clock multipliers are being
scaled and recorded correctly in both dom0 and the
hypervisor, time errors appear immediately after a
frequency change.  They are not more likely when
the frequency is constant.


Signed-off-by: Mark Langsdorf <mark.langsdorf@xxxxxxx>

diff -r 05c22f282023 arch/i386/kernel/cpu/cpufreq/powernow-k8.c
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c        Tue Aug 14 16:20:55 
2007 +0100
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c        Tue Aug 28 14:55:24 
2007 -0500
@@ -738,6 +738,7 @@ static int find_psb_table(struct powerno
 
                data->numps = psb->numps;
                dprintk("numpstates: 0x%x\n", data->numps);
+               data->starting_core_affinity = cpumask_of_cpu(0);
                return fill_powernow_table(data, (struct pst_s *)(psb+1), 
maxvid);
        }
        /*
@@ -758,15 +759,43 @@ static int find_psb_table(struct powerno
 #ifdef CONFIG_X86_POWERNOW_K8_ACPI
 static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 
unsigned int index)
 {
-       if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
+       if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE))
                return;
 
-       data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & 
IRT_MASK;
-       data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & 
RVO_MASK;
-       data->exttype = (data->acpi_data.states[index].control >> 
EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
-       data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) 
& PLL_L_MASK;
-       data->vidmvs = 1 << ((data->acpi_data.states[index].control >> 
MVS_SHIFT) & MVS_MASK);
-       data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & 
VST_MASK;
+       data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & 
IRT_MASK;
+       data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & 
RVO_MASK;
+       data->exttype = (data->acpi_data->states[index].control >> 
EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
+       data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) 
& PLL_L_MASK;
+       data->vidmvs = 1 << ((data->acpi_data->states[index].control >> 
MVS_SHIFT) & MVS_MASK);
+       data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & 
VST_MASK;
+}
+
+static struct acpi_processor_performance *acpi_perf_data[NR_CPUS];
+static int preregister_valid = 0;
+
+static int powernow_k8_cpu_preinit_acpi()
+{
+       int i; 
+       struct acpi_processor_performance *data;
+       for_each_possible_cpu(i) {
+               data = kzalloc(sizeof(struct acpi_processor_performance),
+                               GFP_KERNEL);
+               if (!data) {
+                       int j;
+                       for_each_possible_cpu(j) {
+                               kfree(acpi_perf_data[j]);
+                               acpi_perf_data[j] = NULL;
+                       }
+                       return -ENODEV;
+               }
+               acpi_perf_data[i] = data;
+       }
+
+       if (acpi_processor_preregister_performance(acpi_perf_data))
+               return -ENODEV;
+       else 
+               preregister_valid = 1;
+       return 0;
 }
 
 static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
@@ -774,28 +803,29 @@ static int powernow_k8_cpu_init_acpi(str
        struct cpufreq_frequency_table *powernow_table;
        int ret_val;
 
-       if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
+       data->acpi_data = acpi_perf_data[data->cpu];
+       if (acpi_processor_register_performance(data->acpi_data, data->cpu)) {
                dprintk("register performance failed: bad ACPI data\n");
                return -EIO;
        }
 
        /* verify the data contained in the ACPI structures */
-       if (data->acpi_data.state_count <= 1) {
+       if (data->acpi_data->state_count <= 1) {
                dprintk("No ACPI P-States\n");
                goto err_out;
        }
 
-       if ((data->acpi_data.control_register.space_id != 
ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-               (data->acpi_data.status_register.space_id != 
ACPI_ADR_SPACE_FIXED_HARDWARE)) {
+       if ((data->acpi_data->control_register.space_id != 
ACPI_ADR_SPACE_FIXED_HARDWARE) ||
+               (data->acpi_data->status_register.space_id != 
ACPI_ADR_SPACE_FIXED_HARDWARE)) {
                dprintk("Invalid control/status registers (%x - %x)\n",
-                       data->acpi_data.control_register.space_id,
-                       data->acpi_data.status_register.space_id);
+                       data->acpi_data->control_register.space_id,
+                       data->acpi_data->status_register.space_id);
                goto err_out;
        }
 
        /* fill in data->powernow_table */
        powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
-               * (data->acpi_data.state_count + 1)), GFP_KERNEL);
+               * (data->acpi_data->state_count + 1)), GFP_KERNEL);
        if (!powernow_table) {
                dprintk("powernow_table memory alloc failure\n");
                goto err_out;
@@ -808,28 +838,43 @@ static int powernow_k8_cpu_init_acpi(str
        if (ret_val)
                goto err_out_mem;
 
-       powernow_table[data->acpi_data.state_count].frequency = 
CPUFREQ_TABLE_END;
-       powernow_table[data->acpi_data.state_count].index = 0;
+       powernow_table[data->acpi_data->state_count].frequency = 
CPUFREQ_TABLE_END;
+       powernow_table[data->acpi_data->state_count].index = 0;
        data->powernow_table = powernow_table;
 
        /* fill in data */
-       data->numps = data->acpi_data.state_count;
+       data->numps = data->acpi_data->state_count;
        print_basics(data);
        powernow_k8_acpi_pst_values(data, 0);
 
        /* notify BIOS that we exist */
        acpi_processor_notify_smm(THIS_MODULE);
 
+       /* determine affinity, from ACPI if available */
+       if (preregister_valid) {
+               if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) ||
+                   (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY))
+                       data->starting_core_affinity = 
data->acpi_data->shared_cpu_map;
+               else
+                       data->starting_core_affinity = 
cpumask_of_cpu(data->cpu);
+       } else {
+               /* best guess from family if not */
+               if (cpu_family == CPU_HW_PSTATE)
+                       data->starting_core_affinity = 
cpumask_of_cpu(data->cpu);
+               else
+                       data->starting_core_affinity = cpu_core_map[data->cpu];
+       }
+
        return 0;
 
 err_out_mem:
        kfree(powernow_table);
 
 err_out:
-       acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
-
-       /* data->acpi_data.state_count informs us at ->exit() whether ACPI was 
used */
-       data->acpi_data.state_count = 0;
+       acpi_processor_unregister_performance(data->acpi_data, data->cpu);
+
+       /* data->acpi_data->state_count informs us at ->exit() whether ACPI was 
used */
+       data->acpi_data->state_count = 0;
 
        return -ENODEV;
 }
@@ -838,13 +883,13 @@ static int fill_powernow_table_pstate(st
 {
        int i;
 
-       for (i = 0; i < data->acpi_data.state_count; i++) {
+       for (i = 0; i < data->acpi_data->state_count; i++) {
                u32 index;
                u32 hi = 0, lo = 0;
                u32 fid;
                u32 did;
 
-               index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
+               index = data->acpi_data->states[i].control & HW_PSTATE_MASK;
                if (index > MAX_HW_PSTATE) {
                        printk(KERN_ERR PFX "invalid pstate %d - bad value 
%d.\n", i, index);
                        printk(KERN_ERR PFX "Please report to BIOS 
manufacturer\n");
@@ -865,10 +910,10 @@ static int fill_powernow_table_pstate(st
 
                powernow_table[i].frequency = find_khz_freq_from_fiddid(fid, 
did);
 
-               if (powernow_table[i].frequency != 
(data->acpi_data.states[i].core_frequency * 1000)) {
+               if (powernow_table[i].frequency != 
(data->acpi_data->states[i].core_frequency * 1000)) {
                        printk(KERN_INFO PFX "invalid freq entries %u kHz vs. 
%u kHz\n",
                                powernow_table[i].frequency,
-                               (unsigned int) 
(data->acpi_data.states[i].core_frequency * 1000));
+                               (unsigned int) 
(data->acpi_data->states[i].core_frequency * 1000));
                        powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
                        continue;
                }
@@ -880,16 +925,16 @@ static int fill_powernow_table_fidvid(st
 {
        int i;
        int cntlofreq = 0;
-       for (i = 0; i < data->acpi_data.state_count; i++) {
+       for (i = 0; i < data->acpi_data->state_count; i++) {
                u32 fid;
                u32 vid;
 
                if (data->exttype) {
-                       fid = data->acpi_data.states[i].status & EXT_FID_MASK;
-                       vid = (data->acpi_data.states[i].status >> VID_SHIFT) & 
EXT_VID_MASK;
+                       fid = data->acpi_data->states[i].status & EXT_FID_MASK;
+                       vid = (data->acpi_data->states[i].status >> VID_SHIFT) 
& EXT_VID_MASK;
                } else {
-                       fid = data->acpi_data.states[i].control & FID_MASK;
-                       vid = (data->acpi_data.states[i].control >> VID_SHIFT) 
& VID_MASK;
+                       fid = data->acpi_data->states[i].control & FID_MASK;
+                       vid = (data->acpi_data->states[i].control >> VID_SHIFT) 
& VID_MASK;
                }
 
                dprintk("   %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
@@ -930,10 +975,10 @@ static int fill_powernow_table_fidvid(st
                                cntlofreq = i;
                }
 
-               if (powernow_table[i].frequency != 
(data->acpi_data.states[i].core_frequency * 1000)) {
+               if (powernow_table[i].frequency != 
(data->acpi_data->states[i].core_frequency * 1000)) {
                        printk(KERN_INFO PFX "invalid freq entries %u kHz vs. 
%u kHz\n",
                                powernow_table[i].frequency,
-                               (unsigned int) 
(data->acpi_data.states[i].core_frequency * 1000));
+                               (unsigned int) 
(data->acpi_data->states[i].core_frequency * 1000));
                        powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID;
                        continue;
                }
@@ -943,14 +988,15 @@ static int fill_powernow_table_fidvid(st
 
 static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
 {
-       if (data->acpi_data.state_count)
-               acpi_processor_unregister_performance(&data->acpi_data, 
data->cpu);
+       if (data->acpi_data->state_count)
+               acpi_processor_unregister_performance(data->acpi_data, 
data->cpu);
 }
 
 #else
 static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return 
-ENODEV; }
 static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; 
}
 static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 
unsigned int index) { return; }
+static int powernow_k8_cpu_preinit_acpi() { return -ENODEV; }
 #endif /* CONFIG_X86_POWERNOW_K8_ACPI */
 
 /* Take a frequency, and issue the fid/vid transition command */
@@ -1164,7 +1210,7 @@ static int __cpuinit powernowk8_cpu_init
                 * an UP version, and is deprecated by AMD.
                 */
                if (num_online_cpus() != 1) {
-                       printk(KERN_ERR PFX "MP systems not supported by PSB 
BIOS structure\n");
+                       printk(KERN_ERR PFX "Your BIOS does not provide _PSS 
objects.  PowerNow! does not work on SMP systems without _PSS objects.  
Complain to your BIOS vendor.\n");
                        kfree(data);
                        return -ENODEV;
                }
@@ -1204,10 +1250,7 @@ static int __cpuinit powernowk8_cpu_init
        set_cpus_allowed(current, oldmask);
 
        pol->governor = CPUFREQ_DEFAULT_GOVERNOR;
-       if (cpu_family == CPU_HW_PSTATE)
-               pol->cpus = cpumask_of_cpu(pol->cpu);
-       else
-               pol->cpus = cpu_core_map[pol->cpu];
+       pol->cpus = data->starting_core_affinity;
        data->available_cores = &(pol->cpus);
 
        /* Take a crude guess here.
@@ -1323,6 +1366,7 @@ static int __cpuinit powernowk8_init(voi
        }
 
        if (supported_cpus == num_online_cpus()) {
+               powernow_k8_cpu_preinit_acpi();
                printk(KERN_INFO PFX "Found %d %s "
                        "processors (" VERSION ")\n", supported_cpus,
                        boot_cpu_data.x86_model_id);
diff -r 05c22f282023 arch/i386/kernel/cpu/cpufreq/powernow-k8.h
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h        Tue Aug 14 16:20:55 
2007 +0100
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h        Tue Aug 28 14:55:24 
2007 -0500
@@ -32,12 +32,13 @@ struct powernow_k8_data {
 #ifdef CONFIG_X86_POWERNOW_K8_ACPI
        /* the acpi table needs to be kept. it's only available if ACPI was
         * used to determine valid frequency/vid/fid states */
-       struct acpi_processor_performance acpi_data;
+       struct acpi_processor_performance *acpi_data;
 #endif
        /* we need to keep track of associated cores, but let cpufreq
         * handle hotplug events - so just point at cpufreq pol->cpus
         * structure */
        cpumask_t *available_cores;
+       cpumask_t starting_core_affinity;
 };
diff -r 256160ff19b7 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Thu Aug 16 13:27:59 2007 +0100
+++ b/xen/arch/x86/traps.c      Wed Aug 29 17:10:06 2007 -0500
@@ -1724,6 +1724,15 @@ static int emulate_privileged_op(struct 
             v->arch.guest_context.gs_base_user = res;
             break;
 #endif
+       case MSR_K8_FIDVID_STATUS:
+       case MSR_K8_FIDVID_CTL:
+           if ( IS_COMPAT(v->domain) )
+               goto fail;
+           if ( wrmsr_safe(regs->ecx, regs->eax, regs->edx) )
+               goto fail;
+            v->arch.guest_context.gs_base_user =
+               ((u64)regs->edx << 32) | regs->eax;
+           break;
         default:
             if ( wrmsr_hypervisor_regs(regs->ecx, eax, edx) )
                 break;
@@ -1760,6 +1769,13 @@ static int emulate_privileged_op(struct 
             regs->edx = v->arch.guest_context.gs_base_user >> 32;
             break;
 #endif
+       case MSR_K8_FIDVID_CTL:
+       case MSR_K8_FIDVID_STATUS:
+           if ( IS_COMPAT(v->domain) )
+               goto fail;
+            if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
+               goto fail;
+           break;
         case MSR_EFER:
             if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
                 goto fail;
diff -r 256160ff19b7 xen/include/asm-x86/msr.h
--- a/xen/include/asm-x86/msr.h Thu Aug 16 13:27:59 2007 +0100
+++ b/xen/include/asm-x86/msr.h Wed Aug 29 17:10:06 2007 -0500
@@ -357,6 +357,9 @@ static inline void write_efer(__u64 val)
 #define MSR_K8_VM_CR                   0xC0010114
 #define MSR_K8_VM_HSAVE_PA             0xC0010117
 
+#define MSR_K8_FIDVID_CTL              0xC0010041
+#define MSR_K8_FIDVID_STATUS           0xC0010042
+
 /* MSR_K8_VM_CR bits: */
 #define _K8_VMCR_SVME_DISABLE          4
 #define K8_VMCR_SVME_DISABLE           (1 << _K8_VMCR_SVME_DISABLE)



_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel


 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.