Re: [PATCH] x86: retrieve and log CPU frequency information
On Wed, Apr 15, 2020 at 01:55:24PM +0200, Jan Beulich wrote:
> While from just a single Skylake system it is already clear that we
> can't base any of our logic on CPUID leaf 15 [1] (leaf 16 is
> documented to be used for display purposes only anyway), logging this
> information may still give us some reference in case of problems as well
> as for future work. Additionally on the AMD side it is unclear whether
> the deviation between reported and measured frequencies is because of us
> not doing well, or because of nominal and actual frequencies being quite
> far apart.

Can you add some reference to the AMD implementation? I've looked at
the PMs and haven't been able to find a description of some of the
MSRs, like 0xC0010064.

> The chosen variable naming in amd_log_freq() has pointed out a naming
> problem in rdmsr_safe(), which is being taken care of at the same time.
> Symmetrically wrmsr_safe(), being an inline function, also gets an
> unnecessary underscore dropped from one of its local variables.
>
> [1] With a core crystal clock of 24MHz and a ratio of 216/2, the
> reported frequency nevertheless is 2600MHz, rather than the to be
> expected (and calibrated by both us and Linux) 2592MHz.
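
Just to spell out the arithmetic behind [1]: 24MHz * 216 / 2 = 2592MHz.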

>
> Suggested-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> ---
> TBD: The node ID retrieval using extended leaf 1E implies it won't work
>      on older hardware (pre-Fam15 I think). Besides the Node ID MSR,
>      which doesn't get advertised on my Fam10 box (and it's zero on all
>      processors despite there being two nodes as per the PCI device
>      map), and which isn't even documented for Fam11, Fam12, and Fam14,
>      I didn't find any other means to retrieve the node ID a CPU is
>      associated with - the NodeId register in PCI config space depends
>      on one already knowing the node ID for doing the access, as the
>      device to be used is a function of the node ID.
>
> --- a/xen/arch/x86/cpu/amd.c
> +++ b/xen/arch/x86/cpu/amd.c
> @@ -532,6 +532,102 @@ static void amd_get_topology(struct cpui
>                          : c->cpu_core_id);
>  }
>  
> +void amd_log_freq(const struct cpuinfo_x86 *c)
> +{
> +        unsigned int idx = 0, h;
> +        uint64_t hi, lo, val;
> +
> +        if (c->x86 < 0x10 || c->x86 > 0x19 ||
> +            (c != &boot_cpu_data &&
> +             (!opt_cpu_info || (c->apicid & (c->x86_num_siblings - 1)))))
> +                return;
> +
> +        if (c->x86 < 0x17) {
> +                unsigned int node = 0;
> +                uint64_t nbcfg;
> +
> +                /*
> +                 * Make an attempt at determining the node ID, but assume
> +                 * symmetric setup (using node 0) if this fails.
> +                 */
> +                if (c->extended_cpuid_level >= 0x8000001e &&
> +                    cpu_has(c, X86_FEATURE_TOPOEXT)) {
> +                        node = cpuid_ecx(0x8000001e) & 0xff;
> +                        if (node > 7)
> +                                node = 0;
> +                } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
> +                        rdmsrl(0xC001100C, val);
> +                        node = val & 7;
> +                }
> +
> +                /*
> +                 * Enable (and use) Extended Config Space accesses, as we
> +                 * can't be certain that MCFG is available here during boot.
> +                 */
> +                rdmsrl(MSR_AMD64_NB_CFG, nbcfg);
> +                wrmsrl(MSR_AMD64_NB_CFG,
> +                       nbcfg | (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT));
> +#define PCI_ECS_ADDRESS(sbdf, reg) \
> +        (0x80000000 | ((sbdf).bdf << 8) | ((reg) & 0xfc) | (((reg) & 0xf00) << 16))
> +
> +                for ( ; ; ) {
> +                        pci_sbdf_t sbdf = PCI_SBDF(0, 0, 0x18 | node, 4);
> +
> +                        switch (pci_conf_read32(sbdf, PCI_VENDOR_ID)) {
> +                        case 0x00000000:
> +                        case 0xffffffff:
> +                                /* No device at this SBDF. */
> +                                if (!node)
> +                                        break;
> +                                node = 0;
> +                                continue;
> +
> +                        default:
> +                                /*
> +                                 * Core Performance Boost Control, family
> +                                 * dependent up to 3 bits starting at bit 2.
> +                                 */
> +                                switch (c->x86) {
> +                                case 0x10: idx = 1; break;
> +                                case 0x12: idx = 7; break;
> +                                case 0x14: idx = 7; break;
> +                                case 0x15: idx = 7; break;
> +                                case 0x16: idx = 7; break;
> +                                }
> +                                idx &= pci_conf_read(PCI_ECS_ADDRESS(sbdf,
> +                                                                     0x15c),
> +                                                     0, 4) >> 2;
> +                                break;
> +                        }
> +                        break;
> +                }
> +
> +#undef PCI_ECS_ADDRESS
> +                wrmsrl(MSR_AMD64_NB_CFG, nbcfg);
> +        }
> +
> +        lo = 0; /* gcc may not recognize the loop having at least 5 iterations */
> +        for (h = c->x86 == 0x10 ? 5 : 8; h--; )
> +                if (!rdmsr_safe(0xC0010064 + h, lo) && (lo >> 63))
> +                        break;
> +        if (!(lo >> 63))
> +                return;
> +
> +#define FREQ(v) (c->x86 < 0x17 ? ((((v) & 0x3f) + 0x10) * 100) >> (((v) >> 6) & 7) \
> +                               : (((v) & 0xff) * 25 * 8) / (((v) >> 8) & 0x3f))
> +        if (idx && idx < h &&
> +            !rdmsr_safe(0xC0010064 + idx, val) && (val >> 63) &&
> +            !rdmsr_safe(0xC0010064, hi) && (hi >> 63))
> +                printk("CPU%u: %lu (%lu..%lu) MHz\n",
> +                       smp_processor_id(), FREQ(val), FREQ(lo), FREQ(hi));
> +        else if (h && !rdmsr_safe(0xC0010064, hi) && (hi >> 63))
> +                printk("CPU%u: %lu..%lu MHz\n",
> +                       smp_processor_id(), FREQ(lo), FREQ(hi));
> +        else
> +                printk("CPU%u: %lu MHz\n", smp_processor_id(), FREQ(lo));
> +#undef FREQ
> +}
> +
>  void early_init_amd(struct cpuinfo_x86 *c)
>  {
>          if (c == &boot_cpu_data)
> @@ -803,6 +899,8 @@ static void init_amd(struct cpuinfo_x86
>                  disable_c1_ramping();
>  
>          check_syscfg_dram_mod_en();
> +
> +        amd_log_freq(c);
>  }
>  
>  const struct cpu_dev amd_cpu_dev = {
> --- a/xen/arch/x86/cpu/cpu.h
> +++ b/xen/arch/x86/cpu/cpu.h
> @@ -19,3 +19,4 @@ extern void detect_ht(struct cpuinfo_x86
>  extern bool detect_extended_topology(struct cpuinfo_x86 *c);
>  
>  void early_init_amd(struct cpuinfo_x86 *c);
> +void amd_log_freq(const struct cpuinfo_x86 *c);
> --- a/xen/arch/x86/cpu/hygon.c
> +++ b/xen/arch/x86/cpu/hygon.c
> @@ -99,6 +99,8 @@ static void init_hygon(struct cpuinfo_x8
>                  value |= (1 << 27);     /* Enable read-only APERF/MPERF bit */
>                  wrmsrl(MSR_K7_HWCR, value);
>          }
> +
> +        amd_log_freq(c);
>  }
>  
>  const struct cpu_dev hygon_cpu_dev = {
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -378,6 +378,72 @@ static void init_intel(struct cpuinfo_x8
>           ( c->cpuid_level >= 0x00000006 ) &&
>           ( cpuid_eax(0x00000006) & (1u<<2) ) )
>          __set_bit(X86_FEATURE_ARAT, c->x86_capability);
> +

I would split this into a separate helper, ie: intel_log_freq. That
will allow you to exit early and reduce some of the indentation IMO.
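
Just to illustrate what I mean (untested sketch, keeping your logic
as-is, only inverting the condition so it can bail out early):

static void intel_log_freq(const struct cpuinfo_x86 *c)
{
    unsigned int eax, ebx, ecx, edx;
    uint64_t msrval;

    /* Same condition as below, just inverted. */
    if ( c != &boot_cpu_data &&
         (!opt_cpu_info || (c->apicid & (c->x86_num_siblings - 1))) )
        return;

    /*
     * ... leaf 0x15 / leaf 0x16 / MSR_INTEL_PLATFORM_INFO logging from
     * below, with one level of indentation dropped ...
     */
}

init_intel() would then just gain an intel_log_freq(c) call at the end.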

> +    if ( (opt_cpu_info && !(c->apicid & (c->x86_num_siblings - 1))) ||
> +         c == &boot_cpu_data )
> +    {
> +        unsigned int eax, ebx, ecx, edx;
> +        uint64_t msrval;
> +
> +        if ( c->cpuid_level >= 0x15 )
> +        {
> +            cpuid(0x15, &eax, &ebx, &ecx, &edx);
> +            if ( ecx && ebx && eax )
> +            {
> +                unsigned long long val = ecx;
> +
> +                val *= ebx;
> +                do_div(val, eax);
> +                printk("CPU%u: TSC: %uMHz * %u / %u = %LuMHz\n",
> +                       smp_processor_id(), ecx, ebx, eax, val);
> +            }
> +            else if ( ecx | eax | ebx )
> +            {
> +                printk("CPU%u: TSC:", smp_processor_id());
> +                if ( ecx )
> +                    printk(" core: %uMHz", ecx);
> +                if ( ebx && eax )
> +                    printk(" ratio: %u / %u", ebx, eax);
> +                printk("\n");
> +            }
> +        }
> +
> +        if ( c->cpuid_level >= 0x16 )
> +        {
> +            cpuid(0x16, &eax, &ebx, &ecx, &edx);
> +            if ( ecx | eax | ebx )
> +            {
> +                printk("CPU%u:", smp_processor_id());
> +                if ( ecx )
> +                    printk(" bus: %uMHz", ecx);
> +                if ( eax )
> +                    printk(" base: %uMHz", eax);
> +                if ( ebx )
> +                    printk(" max: %uMHz", ebx);
> +                printk("\n");
> +            }
> +        }
> +
> +        if ( !rdmsr_safe(MSR_INTEL_PLATFORM_INFO, msrval) &&
> +             (uint8_t)(msrval >> 8) )

I would introduce a mask for this field; that would be cleaner, since
you use it both here and below (and it would avoid the casts to
uint8_t).
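
Something along these lines (mask names made up here purely for
illustration, and assuming MASK_EXTR() from xen/lib.h is fine to use in
this context):

/* MSR_INTEL_PLATFORM_INFO: max non-turbo ratio / max efficiency ratio. */
#define PLATFORM_INFO_MAX_RATIO (0xffULL << 8)
#define PLATFORM_INFO_MIN_RATIO (0xffULL << 40)

        if ( !rdmsr_safe(MSR_INTEL_PLATFORM_INFO, msrval) &&
             MASK_EXTR(msrval, PLATFORM_INFO_MAX_RATIO) )

with MASK_EXTR() then also replacing the (uint8_t)(msrval >> 40) and
(uint8_t)(msrval >> 8) uses further down.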

> +        {
> +            unsigned int factor = 10000;
> +
> +            if ( c->x86 == 6 )
> +                switch ( c->x86_model )
> +                {
> +                case 0x1a: case 0x1e: case 0x1f: case 0x2e: /* Nehalem */
> +                case 0x25: case 0x2c: case 0x2f: /* Westmere */
> +                    factor = 13333;

The SDM lists ratio * 100MHz without any notes; why are those models
different? Is this covered by some erratum?

> +                    break;
> +                }
> +
> +            printk("CPU%u: ", smp_processor_id());
> +            if ( (uint8_t)(msrval >> 40) )
> +                printk("%u..", (factor * (uint8_t)(msrval >> 40) + 50) / 100);
> +            printk("%u MHz\n", (factor * (uint8_t)(msrval >> 8) + 50) / 100);

Since you are calculating using Hz, should you use an unsigned long
factor to prevent capping at 4GHz?

> +        }
> +    }
>  }
>  
>  const struct cpu_dev intel_cpu_dev = {
> --- a/xen/include/asm-x86/msr.h
> +++ b/xen/include/asm-x86/msr.h
> @@ -40,8 +40,8 @@ static inline void wrmsrl(unsigned int m
>  
>  /* rdmsr with exception handling */
>  #define rdmsr_safe(msr,val) ({\
> -    int _rc; \
> -    uint32_t lo, hi; \
> +    int rc_; \
> +    uint32_t lo_, hi_; \
>      __asm__ __volatile__( \
>          "1: rdmsr\n2:\n" \
>          ".section .fixup,\"ax\"\n" \
> @@ -49,15 +49,15 @@ static inline void wrmsrl(unsigned int m
>          "   movl %5,%2\n; jmp 2b\n" \
>          ".previous\n" \
>          _ASM_EXTABLE(1b, 3b) \
> -        : "=a" (lo), "=d" (hi), "=&r" (_rc) \
> +        : "=a" (lo_), "=d" (hi_), "=&r" (rc_) \
>          : "c" (msr), "2" (0), "i" (-EFAULT)); \
> -    val = lo | ((uint64_t)hi << 32); \
> -    _rc; })
> +    val = lo_ | ((uint64_t)hi_ << 32); \
> +    rc_; })

Since you are changing the local variable names anyway, I would just
switch rdmsr_safe to a static inline and drop the trailing underscores.
I don't see a reason this has to stay as a macro.
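
FWIW, the shape I have in mind (untested sketch; the asm block is meant
to simply be carried over from the current macro), with callers then
adjusted to pass &val:

/* rdmsr with exception handling */
static inline int rdmsr_safe(unsigned int msr, uint64_t *val)
{
    int rc;
    uint32_t lo, hi;

    __asm__ __volatile__(
        "1: rdmsr\n2:\n"
        ".section .fixup,\"ax\"\n"
        "3: xorl %0,%0\n; xorl %1,%1\n"
        "   movl %5,%2\n; jmp 2b\n"
        ".previous\n"
        _ASM_EXTABLE(1b, 3b)
        : "=a" (lo), "=d" (hi), "=&r" (rc)
        : "c" (msr), "2" (0), "i" (-EFAULT));

    *val = lo | ((uint64_t)hi << 32);

    return rc;
}

The obvious downside is that all existing callers would need touching
to pass a pointer, and that val could no longer be a narrower variable
at the call site.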

>  
>  /* wrmsr with exception handling */
>  static inline int wrmsr_safe(unsigned int msr, uint64_t val)
>  {
> -    int _rc;
> +    int rc;
>      uint32_t lo, hi;
>      lo = (uint32_t)val;
>      hi = (uint32_t)(val >> 32);

Since you are already playing with this, could you initialize lo and hi
at definition time?

Thanks, Roger.