[PATCH V2 5/7] hv: add ACRN CPU frequency initializer


Eddie Dong
 

-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Zhou, Wu
Sent: Friday, September 2, 2022 7:52 AM
To: acrn-dev@...
Cc: Zhou, Wu <wu.zhou@...>
Subject: [acrn-dev] [PATCH V2 5/7] hv: add ACRN CPU frequency initializer

The design of ACRN CPU performance management is to let hardware do the
autonomous frequency selection(or set to a fixed value), and remove guest's
capability to control CPU frequency.

This patch implements the CPU frequency initializer, which sets up the CPU
frequency based on the performance policy type.

Two performance policy types are provided for user to choose from:
- 'Performance': CPU runs at its maximum frequency.
Enable hardware autonomous frequency selection if HWP is present.
- 'Nominal': CPU runs at its guaranteed frequency.

The policy type is passed to hypervisor through boot parameter, as either
'cpu_perf_policy=Nominal' or 'cpu_perf_policy=Performance'.
The default type is 'Performance'.

Both HWP and ACPI p-state are supported. HWP is the first choice, for it
provides hardware autonomous frequency selection, while keeping frequency
transition time low.

Two functions are added to the hypervisor to call:
- init_cpufreq(): called by BSP at start up time. It processes the
boot parameters, and enables HWP if present.
- apply_initial_cpufreq_policy(): called after init_cpufreq().
applies initial CPU frequency policy setting for each core.
It uses a set of frequency limits data struct to quickly decide
what the highest/nominal frequency is. The frequency limits are
generated by config-tools.

The hypervisor will not be governing CPU frequency after initial policy is
applied.

Cores running RTVMs are fixed to nominal/guaranteed frequency, to get more
certainty in latency. This is done by setting the core's frequency limits to
highest=lowest=nominal in config-tools.

Signed-off-by: Wu Zhou <wu.zhou@...>
---
hypervisor/arch/x86/cpu.c | 5 ++
hypervisor/arch/x86/pm.c | 76 +++++++++++++++++++++++
hypervisor/include/arch/x86/asm/board.h | 1 +
hypervisor/include/arch/x86/asm/host_pm.h | 2 +
hypervisor/include/public/acrn_common.h | 16 +++++
5 files changed, 100 insertions(+)

diff --git a/hypervisor/arch/x86/cpu.c b/hypervisor/arch/x86/cpu.c index
0b51fb9c8..f96dd0036 100644
--- a/hypervisor/arch/x86/cpu.c
+++ b/hypervisor/arch/x86/cpu.c
@@ -24,6 +24,7 @@
#include <version.h>
#include <asm/vmx.h>
#include <asm/msr.h>
+#include <asm/host_pm.h>
#include <ptdev.h>
#include <logmsg.h>
#include <asm/rdt.h>
@@ -156,6 +157,8 @@ void init_pcpu_pre(bool is_bsp)

load_pcpu_state_data();

+ init_cpufreq();
+
init_e820();

/* reserve ppt buffer from e820 */
@@ -315,6 +318,8 @@ void init_pcpu_post(uint16_t pcpu_id)
panic("failed to initialize software SRAM!");
}

+ apply_initial_cpufreq_policy();
+
init_sched(pcpu_id);

#ifdef CONFIG_RDT_ENABLED
diff --git a/hypervisor/arch/x86/pm.c b/hypervisor/arch/x86/pm.c index
9af9362cc..7fdd202a2 100644
--- a/hypervisor/arch/x86/pm.c
+++ b/hypervisor/arch/x86/pm.c
@@ -19,6 +19,8 @@
#include <asm/lapic.h>
#include <asm/tsc.h>
#include <delay.h>
+#include <asm/board.h>
+#include <asm/cpuid.h>

struct cpu_context cpu_ctx;

@@ -271,3 +273,77 @@ void reset_host(void)
asm_pause();
}
}
+
+static enum acrn_cpufreq_policy_type cpufreq_policy =
+CPUFREQ_POLICY_PERFORMANCE;
+
+void init_cpufreq(void)
Based on what it does, this does not initialize the CPU frequency; it is rather init_frequency_policy.

+{
+ uint32_t cpuid_06_eax, unused;
+ struct acrn_boot_info *abi = get_acrn_boot_info();
+ const char *cmd_src = abi->cmdline;
+
+ /*
+ * Parse cmdline, decide which policy type to use.
+ * User can either specify cpu_perf_policy=Nominal or
cpu_perf_policy=Performance
+ * The default type is 'Performance'
+ */
+ if(strstr_s(cmd_src, MAX_BOOTARGS_SIZE,
"cpu_perf_policy=Nominal", 24) != NULL) {
+ cpufreq_policy = CPUFREQ_POLICY_NOMINAL;
+ } else {
+ cpufreq_policy = CPUFREQ_POLICY_PERFORMANCE;
This is default value. Do we need this redundant assignment?

+ }
+
+ cpuid_subleaf(0x6U, 0U, &cpuid_06_eax, &unused, &unused,
&unused);
+ if ((cpuid_06_eax & CPUID_EAX_HWP) != 0) {
+ /* If HWP is available, enable HWP early. This will unlock
other HWP MSRs. */
+ msr_write(MSR_IA32_PM_ENABLE, 1U);
This part can be viewed as "init_cpufreq" or "init_frequency_policy" ...
+ }
+}
+
+/*
+ * This Function is to be called by each pcpu after init_cpufreq().
+ * It applies the initial frequency policy, which can be specified from boot
parameters.
+ * - cpu_perf_policy=Performance: HWP autonomous selection, between
highest HWP level and
+ * lowest HWP level. If HWP is not available, the frequency is fixed to
highest p-state.
+ * - cpu_perf_policy=Nominal: frequency is fixed to guaranteed HWP level or
nominal p-state.
+ * The default policy is 'Performance'.
+ *
+ * ACRN will not be governing pcpu's frequency after this.
+ */
+void apply_initial_cpufreq_policy(void)
+{
+ uint64_t highest_lvl_req = 0xff, lowest_lvl_req = 1, reg;
+ uint8_t pstate_req = 0;
+ struct acrn_cpufreq_limits *limits = &cpufreq_limits[get_pcpu_id()];
+ uint32_t cpuid_06_eax, cpuid_01_ecx, unused;
+
+ cpuid_subleaf(0x6U, 0U, &cpuid_06_eax, &unused, &unused,
&unused);
+ cpuid_subleaf(0x1U, 0U, &unused, &unused, &cpuid_01_ecx,
&unused);
+ /* Both HWP and ACPI p-state are supported. HWP is the first choice.
*/
+ if ((cpuid_06_eax & CPUID_EAX_HWP) != 0) {
+ if (cpufreq_policy == CPUFREQ_POLICY_PERFORMANCE) {
+ /* CPU frequency will be autonomously selected.
between highest level and lowest level*/
+ highest_lvl_req = limits->highest_hwp_lvl;
+ lowest_lvl_req = limits->lowest_hwp_lvl;
+ } else if (cpufreq_policy == CPUFREQ_POLICY_NOMINAL) {
+ /* set highest_lvl = lowest_lvl, CPU frequency will be
fixed */
+ highest_lvl_req = limits->guaranteed_hwp_lvl;
+ lowest_lvl_req = limits->guaranteed_hwp_lvl;
+ }
+
+ reg = (0x80UL << 24) | (0x00UL << 16) |
24U ....
Please check with FuSa requirement.

(((uint64_t)highest_lvl_req) << 8) | ((uint64_t)lowest_lvl_req);
+ msr_write(MSR_IA32_HWP_REQUEST, reg);
+ } else if ((cpuid_01_ecx & CPUID_ECX_EST) != 0) {
+ struct cpu_state_info *pm_s_state_data =
get_cpu_pm_state_info();
+
+ if (cpufreq_policy == CPUFREQ_POLICY_PERFORMANCE) {
+ pstate_req = limits->highest_pstate;
+ } else if (cpufreq_policy == CPUFREQ_POLICY_NOMINAL) {
+ pstate_req = limits->nominal_pstate;
+ }
+
+ if (pstate_req < pm_s_state_data->px_cnt) {
+ msr_write(MSR_IA32_PERF_CTL, pm_s_state_data-
px_data[pstate_req].control);
+ }
Assuming the offline tool guarantees that this will never reach the "else" branch,
how about adding an ASSERT in the else branch as a double check, at least in debug mode, so that developers find out quickly after switching to debug mode?

+ }
FuSa requires us to have an "else" branch here.
Meanwhile, if we do reach the else branch, what should we do? Refuse to boot?

+}
diff --git a/hypervisor/include/arch/x86/asm/board.h
b/hypervisor/include/arch/x86/asm/board.h
index 56bbeb9c8..25b2dcc32 100644
--- a/hypervisor/include/arch/x86/asm/board.h
+++ b/hypervisor/include/arch/x86/asm/board.h
@@ -34,6 +34,7 @@ extern struct rdt_type
res_cap_info[RDT_NUM_RESOURCES]; #endif

extern const struct cpu_state_table board_cpu_state_tbl;
+extern struct acrn_cpufreq_limits cpufreq_limits[MAX_PCPU_NUM];
extern const union pci_bdf plat_hidden_pdevs[MAX_HIDDEN_PDEVS_NUM];
extern const struct vmsix_on_msi_info
vmsix_on_msi_devs[MAX_VMSIX_ON_MSI_PDEVS_NUM];

diff --git a/hypervisor/include/arch/x86/asm/host_pm.h
b/hypervisor/include/arch/x86/asm/host_pm.h
index b8fb8a307..ccdff11c8 100644
--- a/hypervisor/include/arch/x86/asm/host_pm.h
+++ b/hypervisor/include/arch/x86/asm/host_pm.h
@@ -39,5 +39,7 @@ extern void restore_s3_context(void); struct
cpu_state_info *get_cpu_pm_state_info(void); struct acpi_reset_reg
*get_host_reset_reg_data(void); void reset_host(void);
+void init_cpufreq(void);
+void apply_initial_cpufreq_policy(void);

#endif /* HOST_PM_H */
diff --git a/hypervisor/include/public/acrn_common.h
b/hypervisor/include/public/acrn_common.h
index fee71a655..87bf40bc6 100644
--- a/hypervisor/include/public/acrn_common.h
+++ b/hypervisor/include/public/acrn_common.h
@@ -522,6 +522,22 @@ struct acrn_pstate_data {
uint64_t status; /* success indicator */
};

+enum acrn_cpufreq_policy_type {
+ CPUFREQ_POLICY_PERFORMANCE,
+ CPUFREQ_POLICY_NOMINAL,
+};
+
+struct acrn_cpufreq_limits {
+ /* Performance levels for HWP */
+ uint8_t guaranteed_hwp_lvl;
+ uint8_t highest_hwp_lvl;
+ uint8_t lowest_hwp_lvl;
+ /* Index for the p-state table _PSS */
+ uint8_t nominal_pstate;
+ uint8_t highest_pstate;
+ uint8_t lowest_pstate;
lowest_pstate is not used?
BTW, if there is simply one value for each frequency mode in ACPI mode, then we can use nominal_xxx & performance_xxx.

+};
+
struct acpi_sx_pkg {
uint8_t val_pm1a;
uint8_t val_pm1b;
--
2.25.1





Zhou, Wu
 

The design of ACRN CPU performance management is to let hardware
do the autonomous frequency selection(or set to a fixed value),
and remove guest's capability to control CPU frequency.

This patch implements the CPU frequency initializer, which sets up
the CPU frequency based on the performance policy type.

Two performance policy types are provided for user to choose from:
- 'Performance': CPU runs at its maximum frequency.
Enable hardware autonomous frequency selection if HWP is present.
- 'Nominal': CPU runs at its guaranteed frequency.

The policy type is passed to hypervisor through boot parameter, as
either 'cpu_perf_policy=Nominal' or 'cpu_perf_policy=Performance'.
The default type is 'Performance'.

Both HWP and ACPI p-state are supported. HWP is the first choice, for
it provides hardware autonomous frequency selection, while keeping
frequency transition time low.

Two functions are added to the hypervisor to call:
- init_cpufreq(): called by BSP at start up time. It processes the
boot parameters, and enables HWP if present.
- apply_initial_cpufreq_policy(): called after init_cpufreq().
applies initial CPU frequency policy setting for each core.
It uses a set of frequency limits data struct to quickly decide
what the highest/nominal frequency is. The frequency limits are
generated by config-tools.

The hypervisor will not be governing CPU frequency after initial policy
is applied.

Cores running RTVMs are fixed to nominal/guaranteed frequency, to get
more certainty in latency. This is done by setting the core's frequency
limits to highest=lowest=nominal in config-tools.

Signed-off-by: Wu Zhou <wu.zhou@...>
---
hypervisor/arch/x86/cpu.c | 5 ++
hypervisor/arch/x86/pm.c | 76 +++++++++++++++++++++++
hypervisor/include/arch/x86/asm/board.h | 1 +
hypervisor/include/arch/x86/asm/host_pm.h | 2 +
hypervisor/include/public/acrn_common.h | 16 +++++
5 files changed, 100 insertions(+)

diff --git a/hypervisor/arch/x86/cpu.c b/hypervisor/arch/x86/cpu.c
index 0b51fb9c8..f96dd0036 100644
--- a/hypervisor/arch/x86/cpu.c
+++ b/hypervisor/arch/x86/cpu.c
@@ -24,6 +24,7 @@
#include <version.h>
#include <asm/vmx.h>
#include <asm/msr.h>
+#include <asm/host_pm.h>
#include <ptdev.h>
#include <logmsg.h>
#include <asm/rdt.h>
@@ -156,6 +157,8 @@ void init_pcpu_pre(bool is_bsp)

load_pcpu_state_data();

+ init_cpufreq();
+
init_e820();

/* reserve ppt buffer from e820 */
@@ -315,6 +318,8 @@ void init_pcpu_post(uint16_t pcpu_id)
panic("failed to initialize software SRAM!");
}

+ apply_initial_cpufreq_policy();
+
init_sched(pcpu_id);

#ifdef CONFIG_RDT_ENABLED
diff --git a/hypervisor/arch/x86/pm.c b/hypervisor/arch/x86/pm.c
index 9af9362cc..7fdd202a2 100644
--- a/hypervisor/arch/x86/pm.c
+++ b/hypervisor/arch/x86/pm.c
@@ -19,6 +19,8 @@
#include <asm/lapic.h>
#include <asm/tsc.h>
#include <delay.h>
+#include <asm/board.h>
+#include <asm/cpuid.h>

struct cpu_context cpu_ctx;

@@ -271,3 +273,77 @@ void reset_host(void)
asm_pause();
}
}
+
+static enum acrn_cpufreq_policy_type cpufreq_policy = CPUFREQ_POLICY_PERFORMANCE; /* set by init_cpufreq() */
+
+void init_cpufreq(void)
+{
+ uint32_t cpuid_06_eax, unused;
+ struct acrn_boot_info *abi = get_acrn_boot_info();
+ const char *cmd_src = abi->cmdline;
+
+ /*
+ * Parse the boot cmdline to decide which policy type to use.
+ * The user can specify either cpu_perf_policy=Nominal or cpu_perf_policy=Performance.
+ * Anything other than 'Nominal' selects the default type, 'Performance'.
+ */
+ if(strstr_s(cmd_src, MAX_BOOTARGS_SIZE, "cpu_perf_policy=Nominal", 24) != NULL) {
+ cpufreq_policy = CPUFREQ_POLICY_NOMINAL;
+ } else {
+ cpufreq_policy = CPUFREQ_POLICY_PERFORMANCE;
+ }
+
+ cpuid_subleaf(0x6U, 0U, &cpuid_06_eax, &unused, &unused, &unused);
+ if ((cpuid_06_eax & CPUID_EAX_HWP) != 0) {
+ /* If HWP is available, enable it early. This will unlock the other HWP MSRs. */
+ msr_write(MSR_IA32_PM_ENABLE, 1U);
+ }
+}
+
+/*
+ * This function is to be called by each pcpu after init_cpufreq().
+ * It applies the initial frequency policy, which can be specified from boot parameters.
+ * - cpu_perf_policy=Performance: HWP autonomous selection, between highest HWP level and
+ * lowest HWP level. If HWP is not available, the frequency is fixed to highest p-state.
+ * - cpu_perf_policy=Nominal: frequency is fixed to guaranteed HWP level or nominal p-state.
+ * The default policy is 'Performance'.
+ *
+ * ACRN will not be governing pcpu's frequency after this.
+ */
+void apply_initial_cpufreq_policy(void)
+{
+ uint64_t highest_lvl_req = 0xff, lowest_lvl_req = 1, reg;
+ uint8_t pstate_req = 0;
+ struct acrn_cpufreq_limits *limits = &cpufreq_limits[get_pcpu_id()];
+ uint32_t cpuid_06_eax, cpuid_01_ecx, unused;
+
+ cpuid_subleaf(0x6U, 0U, &cpuid_06_eax, &unused, &unused, &unused);
+ cpuid_subleaf(0x1U, 0U, &unused, &unused, &cpuid_01_ecx, &unused);
+ /* Both HWP and ACPI p-state are supported. HWP is the first choice. */
+ if ((cpuid_06_eax & CPUID_EAX_HWP) != 0) {
+ if (cpufreq_policy == CPUFREQ_POLICY_PERFORMANCE) {
+ /* CPU frequency will be autonomously selected between the highest and lowest levels. */
+ highest_lvl_req = limits->highest_hwp_lvl;
+ lowest_lvl_req = limits->lowest_hwp_lvl;
+ } else if (cpufreq_policy == CPUFREQ_POLICY_NOMINAL) {
+ /* set highest_lvl = lowest_lvl, CPU frequency will be fixed */
+ highest_lvl_req = limits->guaranteed_hwp_lvl;
+ lowest_lvl_req = limits->guaranteed_hwp_lvl;
+ }
+
+ reg = (0x80UL << 24) | (0x00UL << 16) | (((uint64_t)highest_lvl_req) << 8) | ((uint64_t)lowest_lvl_req); /* NOTE(review): presumably EPP=0x80, desired=0, max, min per the IA32_HWP_REQUEST layout - confirm against the SDM */
+ msr_write(MSR_IA32_HWP_REQUEST, reg);
+ } else if ((cpuid_01_ecx & CPUID_ECX_EST) != 0) {
+ struct cpu_state_info *pm_s_state_data = get_cpu_pm_state_info();
+
+ if (cpufreq_policy == CPUFREQ_POLICY_PERFORMANCE) {
+ pstate_req = limits->highest_pstate;
+ } else if (cpufreq_policy == CPUFREQ_POLICY_NOMINAL) {
+ pstate_req = limits->nominal_pstate;
+ }
+
+ if (pstate_req < pm_s_state_data->px_cnt) {
+ msr_write(MSR_IA32_PERF_CTL, pm_s_state_data->px_data[pstate_req].control);
+ }
+ }
+}
diff --git a/hypervisor/include/arch/x86/asm/board.h b/hypervisor/include/arch/x86/asm/board.h
index 56bbeb9c8..25b2dcc32 100644
--- a/hypervisor/include/arch/x86/asm/board.h
+++ b/hypervisor/include/arch/x86/asm/board.h
@@ -34,6 +34,7 @@ extern struct rdt_type res_cap_info[RDT_NUM_RESOURCES];
#endif

extern const struct cpu_state_table board_cpu_state_tbl;
+extern struct acrn_cpufreq_limits cpufreq_limits[MAX_PCPU_NUM];
extern const union pci_bdf plat_hidden_pdevs[MAX_HIDDEN_PDEVS_NUM];
extern const struct vmsix_on_msi_info vmsix_on_msi_devs[MAX_VMSIX_ON_MSI_PDEVS_NUM];

diff --git a/hypervisor/include/arch/x86/asm/host_pm.h b/hypervisor/include/arch/x86/asm/host_pm.h
index b8fb8a307..ccdff11c8 100644
--- a/hypervisor/include/arch/x86/asm/host_pm.h
+++ b/hypervisor/include/arch/x86/asm/host_pm.h
@@ -39,5 +39,7 @@ extern void restore_s3_context(void);
struct cpu_state_info *get_cpu_pm_state_info(void);
struct acpi_reset_reg *get_host_reset_reg_data(void);
void reset_host(void);
+void init_cpufreq(void);
+void apply_initial_cpufreq_policy(void);

#endif /* HOST_PM_H */
diff --git a/hypervisor/include/public/acrn_common.h b/hypervisor/include/public/acrn_common.h
index fee71a655..87bf40bc6 100644
--- a/hypervisor/include/public/acrn_common.h
+++ b/hypervisor/include/public/acrn_common.h
@@ -522,6 +522,22 @@ struct acrn_pstate_data {
uint64_t status; /* success indicator */
};

+enum acrn_cpufreq_policy_type {
+ CPUFREQ_POLICY_PERFORMANCE, /* default policy */
+ CPUFREQ_POLICY_NOMINAL, /* selected by the cpu_perf_policy=Nominal boot parameter */
+};
+
+struct acrn_cpufreq_limits {
+ /* Performance levels for HWP */
+ uint8_t guaranteed_hwp_lvl; /* used as both highest and lowest request under the Nominal policy */
+ uint8_t highest_hwp_lvl; /* highest request under the Performance policy */
+ uint8_t lowest_hwp_lvl; /* lowest request under the Performance policy */
+ /* Indexes into the p-state table (_PSS) */
+ uint8_t nominal_pstate; /* requested under the Nominal policy */
+ uint8_t highest_pstate; /* requested under the Performance policy */
+ uint8_t lowest_pstate;
+};
+
struct acpi_sx_pkg {
uint8_t val_pm1a;
uint8_t val_pm1b;
--
2.25.1