[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] x86: retrieve and log CPU frequency information



On Wed, Apr 15, 2020 at 01:55:24PM +0200, Jan Beulich wrote:
> While from just a single Skylake system it is already clear that we
> can't base any of our logic on CPUID leaf 15 [1] (leaf 16 is
> documented to be used for display purposes only anyway), logging this
> information may still give us some reference in case of problems as well
> as for future work. Additionally on the AMD side it is unclear whether
> the deviation between reported and measured frequencies is because of us
> not doing well, or because of nominal and actual frequencies being quite
> far apart.

Can you add some reference to the AMD implementation? I've looked at
the PMs and haven't been able to find a description of some of the
MSRs, like 0xC0010064.

> The chosen variable naming in amd_log_freq() has pointed out a naming
> problem in rdmsr_safe(), which is being taken care of at the same time.
> Symmetrically wrmsr_safe(), being an inline function, also gets an
> unnecessary underscore dropped from one of its local variables.
> 
> [1] With a core crystal clock of 24MHz and a ratio of 216/2, the
>     reported frequency nevertheless is 2600MHz, rather than the to be
>     expected (and calibrated by both us and Linux) 2592MHz.
> 
> Suggested-by: Andrew Cooper <andrew.cooper3@xxxxxxxxxx>
> Signed-off-by: Jan Beulich <jbeulich@xxxxxxxx>
> ---
> TBD: The node ID retrieval using extended leaf 1E implies it won't work
>      on older hardware (pre-Fam15 I think). Besides the Node ID MSR,
>      which doesn't get advertised on my Fam10 box (and it's zero on all
>      processors despite there being two nodes as per the PCI device
>      map), and which isn't even documented for Fam11, Fam12, and Fam14,
>      I didn't find any other means to retrieve the node ID a CPU is
>      associated with - the NodeId register in PCI config space depends
>      on one already knowing the node ID for doing the access, as the
>      device to be used is a function of the node ID.
> 
> --- a/xen/arch/x86/cpu/amd.c
> +++ b/xen/arch/x86/cpu/amd.c
> @@ -532,6 +532,102 @@ static void amd_get_topology(struct cpui
>                                                            : c->cpu_core_id);
>  }
>  
> +void amd_log_freq(const struct cpuinfo_x86 *c)
> +{
> +     unsigned int idx = 0, h;
> +     uint64_t hi, lo, val;
> +
> +     if (c->x86 < 0x10 || c->x86 > 0x19 ||
> +         (c != &boot_cpu_data &&
> +          (!opt_cpu_info || (c->apicid & (c->x86_num_siblings - 1)))))
> +             return;
> +
> +     if (c->x86 < 0x17) {
> +             unsigned int node = 0;
> +             uint64_t nbcfg;
> +
> +             /*
> +              * Make an attempt at determining the node ID, but assume
> +              * symmetric setup (using node 0) if this fails.
> +              */
> +             if (c->extended_cpuid_level >= 0x8000001e &&
> +                 cpu_has(c, X86_FEATURE_TOPOEXT)) {
> +                     node = cpuid_ecx(0x8000001e) & 0xff;
> +                     if (node > 7)
> +                             node = 0;
> +             } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
> +                     rdmsrl(0xC001100C, val);
> +                     node = val & 7;
> +             }
> +
> +             /*
> +              * Enable (and use) Extended Config Space accesses, as we
> +              * can't be certain that MCFG is available here during boot.
> +              */
> +             rdmsrl(MSR_AMD64_NB_CFG, nbcfg);
> +             wrmsrl(MSR_AMD64_NB_CFG,
> +                    nbcfg | (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT));
> +#define PCI_ECS_ADDRESS(sbdf, reg) \
> +    (0x80000000 | ((sbdf).bdf << 8) | ((reg) & 0xfc) | (((reg) & 0xf00) << 
> 16))
> +
> +             for ( ; ; ) {
> +                     pci_sbdf_t sbdf = PCI_SBDF(0, 0, 0x18 | node, 4);
> +
> +                     switch (pci_conf_read32(sbdf, PCI_VENDOR_ID)) {
> +                     case 0x00000000:
> +                     case 0xffffffff:
> +                             /* No device at this SBDF. */
> +                             if (!node)
> +                                     break;
> +                             node = 0;
> +                             continue;
> +
> +                     default:
> +                             /*
> +                              * Core Performance Boost Control, family
> +                              * dependent up to 3 bits starting at bit 2.
> +                              */
> +                             switch (c->x86) {
> +                             case 0x10: idx = 1; break;
> +                             case 0x12: idx = 7; break;
> +                             case 0x14: idx = 7; break;
> +                             case 0x15: idx = 7; break;
> +                             case 0x16: idx = 7; break;
> +                             }
> +                             idx &= pci_conf_read(PCI_ECS_ADDRESS(sbdf,
> +                                                                  0x15c),
> +                                                  0, 4) >> 2;
> +                             break;
> +                     }
> +                     break;
> +             }
> +
> +#undef PCI_ECS_ADDRESS
> +             wrmsrl(MSR_AMD64_NB_CFG, nbcfg);
> +     }
> +
> +     lo = 0; /* gcc may not recognize the loop having at least 5 iterations 
> */
> +     for (h = c->x86 == 0x10 ? 5 : 8; h--; )
> +             if (!rdmsr_safe(0xC0010064 + h, lo) && (lo >> 63))
> +                     break;
> +     if (!(lo >> 63))
> +             return;
> +
> +#define FREQ(v) (c->x86 < 0x17 ? ((((v) & 0x3f) + 0x10) * 100) >> (((v) >> 
> 6) & 7) \
> +                                  : (((v) & 0xff) * 25 * 8) / (((v) >> 8) & 
> 0x3f))
> +     if (idx && idx < h &&
> +         !rdmsr_safe(0xC0010064 + idx, val) && (val >> 63) &&
> +         !rdmsr_safe(0xC0010064, hi) && (hi >> 63))
> +             printk("CPU%u: %lu (%lu..%lu) MHz\n",
> +                    smp_processor_id(), FREQ(val), FREQ(lo), FREQ(hi));
> +     else if (h && !rdmsr_safe(0xC0010064, hi) && (hi >> 63))
> +             printk("CPU%u: %lu..%lu MHz\n",
> +                    smp_processor_id(), FREQ(lo), FREQ(hi));
> +     else
> +             printk("CPU%u: %lu MHz\n", smp_processor_id(), FREQ(lo));
> +#undef FREQ
> +}
> +
>  void early_init_amd(struct cpuinfo_x86 *c)
>  {
>       if (c == &boot_cpu_data)
> @@ -803,6 +899,8 @@ static void init_amd(struct cpuinfo_x86
>               disable_c1_ramping();
>  
>       check_syscfg_dram_mod_en();
> +
> +     amd_log_freq(c);
>  }
>  
>  const struct cpu_dev amd_cpu_dev = {
> --- a/xen/arch/x86/cpu/cpu.h
> +++ b/xen/arch/x86/cpu/cpu.h
> @@ -19,3 +19,4 @@ extern void detect_ht(struct cpuinfo_x86
>  extern bool detect_extended_topology(struct cpuinfo_x86 *c);
>  
>  void early_init_amd(struct cpuinfo_x86 *c);
> +void amd_log_freq(const struct cpuinfo_x86 *c);
> --- a/xen/arch/x86/cpu/hygon.c
> +++ b/xen/arch/x86/cpu/hygon.c
> @@ -99,6 +99,8 @@ static void init_hygon(struct cpuinfo_x8
>               value |= (1 << 27); /* Enable read-only APERF/MPERF bit */
>               wrmsrl(MSR_K7_HWCR, value);
>       }
> +
> +     amd_log_freq(c);
>  }
>  
>  const struct cpu_dev hygon_cpu_dev = {
> --- a/xen/arch/x86/cpu/intel.c
> +++ b/xen/arch/x86/cpu/intel.c
> @@ -378,6 +378,72 @@ static void init_intel(struct cpuinfo_x8
>            ( c->cpuid_level >= 0x00000006 ) &&
>            ( cpuid_eax(0x00000006) & (1u<<2) ) )
>               __set_bit(X86_FEATURE_ARAT, c->x86_capability);
> +

I would split this into a separate helper, ie: intel_log_freq. That
will allow you to exit early and reduce some of the indentation IMO.

> +    if ( (opt_cpu_info && !(c->apicid & (c->x86_num_siblings - 1))) ||
> +         c == &boot_cpu_data )
> +    {
> +        unsigned int eax, ebx, ecx, edx;
> +        uint64_t msrval;
> +
> +        if ( c->cpuid_level >= 0x15 )
> +        {
> +            cpuid(0x15, &eax, &ebx, &ecx, &edx);
> +            if ( ecx && ebx && eax )
> +            {
> +                unsigned long long val = ecx;
> +
> +                val *= ebx;
> +                do_div(val, eax);
> +                printk("CPU%u: TSC: %uMHz * %u / %u = %LuMHz\n",
> +                       smp_processor_id(), ecx, ebx, eax, val);
> +            }
> +            else if ( ecx | eax | ebx )
> +            {
> +                printk("CPU%u: TSC:", smp_processor_id());
> +                if ( ecx )
> +                    printk(" core: %uMHz", ecx);
> +                if ( ebx && eax )
> +                    printk(" ratio: %u / %u", ebx, eax);
> +                printk("\n");
> +            }
> +        }
> +
> +        if ( c->cpuid_level >= 0x16 )
> +        {
> +            cpuid(0x16, &eax, &ebx, &ecx, &edx);
> +            if ( ecx | eax | ebx )
> +            {
> +                printk("CPU%u:", smp_processor_id());
> +                if ( ecx )
> +                    printk(" bus: %uMHz", ecx);
> +                if ( eax )
> +                    printk(" base: %uMHz", eax);
> +                if ( ebx )
> +                    printk(" max: %uMHz", ebx);
> +                printk("\n");
> +            }
> +        }
> +
> +        if ( !rdmsr_safe(MSR_INTEL_PLATFORM_INFO, msrval) &&
> +             (uint8_t)(msrval >> 8) )

I would introduce a mask for it would be cleaner, since you use it
here and below (and would avoid the casting to uint8_t.

> +        {
> +            unsigned int factor = 10000;
> +
> +            if ( c->x86 == 6 )
> +                switch ( c->x86_model )
> +                {
> +                case 0x1a: case 0x1e: case 0x1f: case 0x2e: /* Nehalem */
> +                case 0x25: case 0x2c: case 0x2f: /* Westmere */
> +                    factor = 13333;

The SDM lists ratio * 100MHz without any notes, why are those models
different, is this some errata?

> +                    break;
> +                }
> +
> +            printk("CPU%u: ", smp_processor_id());
> +            if ( (uint8_t)(msrval >> 40) )
> +                printk("%u..", (factor * (uint8_t)(msrval >> 40) + 50) / 
> 100);
> +            printk("%u MHz\n", (factor * (uint8_t)(msrval >> 8) + 50) / 100);

Since you are calculating using Hz, should you use an unsigned long
factor to prevent capping at 4GHz?

> +        }
> +    }
>  }
>  
>  const struct cpu_dev intel_cpu_dev = {
> --- a/xen/include/asm-x86/msr.h
> +++ b/xen/include/asm-x86/msr.h
> @@ -40,8 +40,8 @@ static inline void wrmsrl(unsigned int m
>  
>  /* rdmsr with exception handling */
>  #define rdmsr_safe(msr,val) ({\
> -    int _rc; \
> -    uint32_t lo, hi; \
> +    int rc_; \
> +    uint32_t lo_, hi_; \
>      __asm__ __volatile__( \
>          "1: rdmsr\n2:\n" \
>          ".section .fixup,\"ax\"\n" \
> @@ -49,15 +49,15 @@ static inline void wrmsrl(unsigned int m
>          "   movl %5,%2\n; jmp 2b\n" \
>          ".previous\n" \
>          _ASM_EXTABLE(1b, 3b) \
> -        : "=a" (lo), "=d" (hi), "=&r" (_rc) \
> +        : "=a" (lo_), "=d" (hi_), "=&r" (rc_) \
>          : "c" (msr), "2" (0), "i" (-EFAULT)); \
> -    val = lo | ((uint64_t)hi << 32); \
> -    _rc; })
> +    val = lo_ | ((uint64_t)hi_ << 32); \
> +    rc_; })

Since you are changing the local variable names, I would just switch
rdmsr_safe to a static inline, and drop the underlines. I don't see a
reason this has to stay as a macro.

>  
>  /* wrmsr with exception handling */
>  static inline int wrmsr_safe(unsigned int msr, uint64_t val)
>  {
> -    int _rc;
> +    int rc;
>      uint32_t lo, hi;
>      lo = (uint32_t)val;
>      hi = (uint32_t)(val >> 32);

Since you are already playing with this, could you initialize lo and
hi at definition time?

Thanks, Roger.



 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.