
[RFC PATCH v2 2/2] hv: vpci: trap PCIe ECAM access for SOS

Li, Fei1
 

The SOS will use PCIe ECAM to access the PCIe extended configuration space. The HV should
trap this access for security. (Pre-launched VMs are not meant to support PCIe ECAM for now;
for post-launched VMs, PCIe ECAM access is trapped in the DM.)

Tracked-On: #3475
Signed-off-by: Li Fei1 <fei1.li@...>
---
hypervisor/dm/vpci/vpci.c | 43 +++++++++++++++++++++++++++++++++++-
hypervisor/include/dm/vpci.h | 7 +++---
hypervisor/include/hw/pci.h | 5 ++++-
3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/hypervisor/dm/vpci/vpci.c b/hypervisor/dm/vpci/vpci.c
index 78fa17cb..528bd93a 100644
--- a/hypervisor/dm/vpci/vpci.c
+++ b/hypervisor/dm/vpci/vpci.c
@@ -29,7 +29,9 @@

#include <vm.h>
#include <vtd.h>
+#include <io.h>
#include <mmu.h>
+#include <ept.h>
#include <logmsg.h>
#include "vpci_priv.h"
#include "pci_dev.h"
@@ -172,6 +174,38 @@ static bool pci_cfgdata_io_write(struct acrn_vcpu *vcpu, uint16_t addr, size_t b
return true;
}

+/**
+ * @pre io_req != NULL && private_data != NULL
+ */
+static int32_t vpci_handle_mmconfig_access(struct io_request *io_req, void *private_data)
+{
+ struct mmio_request *mmio = &io_req->reqs.mmio;
+ struct acrn_vpci *vpci = (struct acrn_vpci *)private_data;
+ uint64_t pci_mmcfg_base = vpci->pci_mmcfg_base;
+ uint64_t address = mmio->address;
+ uint32_t reg_num = (uint32_t)(address & 0xfffUL);
+ union pci_bdf bdf;
+
+ /**
+ * Enhanced Configuration Address Mapping
+ * A[(20+n-1):20] Bus Number 1 ≤ n ≤ 8
+ * A[19:15] Device Number
+ * A[14:12] Function Number
+ * A[11:8] Extended Register Number
+ * A[7:2] Register Number
+ * A[1:0] Along with size of the access, used to generate Byte Enables
+ */
+ bdf.value = (uint16_t)((address - pci_mmcfg_base) >> 12U);
+
+ if (mmio->direction == REQUEST_READ) {
+ read_cfg(vpci, bdf, reg_num, mmio->size, (uint32_t *)&mmio->value);
+ } else {
+ write_cfg(vpci, bdf, reg_num, mmio->size, (uint32_t)mmio->value);
+ }
+
+ return 0;
+}
+
/**
* @pre vm != NULL
* @pre vm->vm_id < CONFIG_MAX_VM_NUM
@@ -189,6 +223,7 @@ void vpci_init(struct acrn_vm *vm)
};

struct acrn_vm_config *vm_config;
+ uint64_t pci_mmcfg_base;

vm->vpci.vm = vm;
vm->iommu = create_iommu_domain(vm->vm_id, hva2hpa(vm->arch_vm.nworld_eptp), 48U);
@@ -197,8 +232,14 @@ void vpci_init(struct acrn_vm *vm)

vm_config = get_vm_config(vm->vm_id);
switch (vm_config->load_order) {
- case PRE_LAUNCHED_VM:
case SOS_VM:
+ pci_mmcfg_base = get_mmcfg_base();
+ vm->vpci.pci_mmcfg_base = pci_mmcfg_base;
+ register_mmio_emulation_handler(vm, vpci_handle_mmconfig_access,
+ pci_mmcfg_base, pci_mmcfg_base + PCI_MMCONFIG_SIZE, &vm->vpci);
+ ept_del_mr(vm, (uint64_t *)vm->arch_vm.nworld_eptp, pci_mmcfg_base, PCI_MMCONFIG_SIZE);
+ /* falls through */
+ case PRE_LAUNCHED_VM:
/*
* SOS: intercept port CF8 only.
* UOS or pre-launched VM: register handler for CF8 only and I/O requests to CF9/CFA/CFB are
diff --git a/hypervisor/include/dm/vpci.h b/hypervisor/include/dm/vpci.h
index a5110755..c16aacbb 100644
--- a/hypervisor/include/dm/vpci.h
+++ b/hypervisor/include/dm/vpci.h
@@ -70,9 +70,9 @@ struct pci_msix {
};

union pci_cfgdata {
- uint8_t data_8[PCI_REGMAX + 1U];
- uint16_t data_16[(PCI_REGMAX + 1U) >> 1U];
- uint32_t data_32[(PCI_REGMAX + 1U) >> 2U];
+ uint8_t data_8[PCIE_CONFIG_SPACE_SIZE];
+ uint16_t data_16[PCIE_CONFIG_SPACE_SIZE >> 1U];
+ uint32_t data_32[PCIE_CONFIG_SPACE_SIZE >> 2U];
};

struct pci_vdev;
@@ -123,6 +123,7 @@ struct acrn_vpci {
spinlock_t lock;
struct acrn_vm *vm;
union pci_cfg_addr_reg addr;
+ uint64_t pci_mmcfg_base;
uint32_t pci_vdev_cnt;
struct pci_vdev pci_vdevs[CONFIG_MAX_PCI_DEV_NUM];
};
diff --git a/hypervisor/include/hw/pci.h b/hypervisor/include/hw/pci.h
index 42fc3f9d..0dd98fbf 100644
--- a/hypervisor/include/hw/pci.h
+++ b/hypervisor/include/hw/pci.h
@@ -48,9 +48,12 @@
#define PCI_SLOTMAX 0x1FU
#define PCI_FUNCMAX 0x7U
#define PCI_BAR_COUNT 0x6U
-#define PCI_REGMAX 0xFFU
#define PCI_REGMASK 0xFCU

+#define PCI_CONFIG_SPACE_SIZE 0x100U
+#define PCIE_CONFIG_SPACE_SIZE 0x1000U
+#define PCI_MMCONFIG_SIZE 0x10000000U
+
/* I/O ports */
#define PCI_CONFIG_ADDR 0xCF8U
#define PCI_CONFIG_DATA 0xCFCU
--
2.17.1
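
For reference, below is a minimal standalone sketch of the ECAM address decode that
vpci_handle_mmconfig_access in the patch above performs. The mmcfg base and the trapped
address are made-up example values, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t pci_mmcfg_base = 0xE0000000UL;   /* assumed platform MMCONFIG base */
	uint64_t address = 0xE00A3004UL;          /* example trapped ECAM MMIO address */
	uint64_t offset = address - pci_mmcfg_base;

	/* each function's config space occupies 4KB in ECAM, so bits 11:0 are the
	 * register offset and the bits above select bus/device/function */
	uint16_t bdf = (uint16_t)(offset >> 12U);
	uint32_t reg_num = (uint32_t)(offset & 0xFFFUL);

	printf("bus %u, dev %u, func %u, reg 0x%x\n",
	       (unsigned int)((bdf >> 8U) & 0xFFU), (unsigned int)((bdf >> 3U) & 0x1FU),
	       (unsigned int)(bdf & 0x7U), (unsigned int)reg_num);
	return 0;
}

With these example values the decode yields bus 0, device 20, function 3, register 0x4,
which is then handed to read_cfg()/write_cfg() just as in the handler above.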


[RFC PATCH v2 1/2] hv: pci: use ECAM to access PCIe Configuration Space

Li, Fei1
 

Use the Enhanced Configuration Access Mechanism (MMIO) instead of the PCI-compatible
Configuration Mechanism (IO port) to access the PCIe Configuration Space.

Tracked-On: #3475
Signed-off-by: Li Fei1 <fei1.li@...>
---
hypervisor/hw/pci.c | 50 ++++++++++++++++++++++++++-------------------
1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/hypervisor/hw/pci.c b/hypervisor/hw/pci.c
index ea301338..4bc230cd 100644
--- a/hypervisor/hw/pci.c
+++ b/hypervisor/hw/pci.c
@@ -34,6 +34,7 @@
#include <types.h>
#include <spinlock.h>
#include <io.h>
+#include <pgtable.h>
#include <pci.h>
#include <uart16550.h>
#include <logmsg.h>
@@ -86,67 +87,74 @@ uint32_t pci_lookup_drhd_for_pbdf(uint16_t pbdf)
return drhd_index;
}

-static uint32_t pci_pdev_calc_address(union pci_bdf bdf, uint32_t offset)
+/*
+ * @pre offset < 0x1000U
+ */
+static inline uint32_t pci_pdev_calc_address(union pci_bdf bdf, uint32_t offset)
{
- uint32_t addr = (uint32_t)bdf.value;
-
- addr <<= 8U;
- addr |= (offset | PCI_CFG_ENABLE);
- return addr;
+ return pci_mmcfg_base + (((uint32_t)bdf.value << 12U) | offset);
}

+/*
+ * @pre bytes == 1U || bytes == 2U || bytes == 4U
+ */
uint32_t pci_pdev_read_cfg(union pci_bdf bdf, uint32_t offset, uint32_t bytes)
{
uint32_t addr;
uint32_t val;
+ void *mmcfg_base_hva;

addr = pci_pdev_calc_address(bdf, offset);
+ mmcfg_base_hva = hpa2hva(addr);

spinlock_obtain(&pci_device_lock);
+ stac();

- /* Write address to ADDRESS register */
- pio_write32(addr, (uint16_t)PCI_CONFIG_ADDR);
-
- /* Read result from DATA register */
switch (bytes) {
case 1U:
- val = (uint32_t)pio_read8((uint16_t)PCI_CONFIG_DATA + ((uint16_t)offset & 3U));
+ val = (uint32_t)mmio_read8(mmcfg_base_hva);
break;
case 2U:
- val = (uint32_t)pio_read16((uint16_t)PCI_CONFIG_DATA + ((uint16_t)offset & 2U));
+ val = (uint32_t)mmio_read16(mmcfg_base_hva);
break;
default:
- val = pio_read32((uint16_t)PCI_CONFIG_DATA);
+ val = mmio_read32(mmcfg_base_hva);
break;
}
+
+ clac();
spinlock_release(&pci_device_lock);

return val;
}

+/*
+ * @pre bytes == 1U || bytes == 2U || bytes == 4U
+ */
void pci_pdev_write_cfg(union pci_bdf bdf, uint32_t offset, uint32_t bytes, uint32_t val)
{
uint32_t addr;
-
- spinlock_obtain(&pci_device_lock);
+ void *mmcfg_base_hva;

addr = pci_pdev_calc_address(bdf, offset);
+ mmcfg_base_hva = hpa2hva(addr);

- /* Write address to ADDRESS register */
- pio_write32(addr, (uint16_t)PCI_CONFIG_ADDR);
+ spinlock_obtain(&pci_device_lock);
+ stac();

- /* Write value to DATA register */
switch (bytes) {
case 1U:
- pio_write8((uint8_t)val, (uint16_t)PCI_CONFIG_DATA + ((uint16_t)offset & 3U));
+ mmio_write8((uint8_t)val, mmcfg_base_hva);
break;
case 2U:
- pio_write16((uint16_t)val, (uint16_t)PCI_CONFIG_DATA + ((uint16_t)offset & 2U));
+ mmio_write16((uint16_t)val, mmcfg_base_hva);
break;
default:
- pio_write32(val, (uint16_t)PCI_CONFIG_DATA);
+ mmio_write32(val, mmcfg_base_hva);
break;
}
+
+ clac();
spinlock_release(&pci_device_lock);
}

--
2.17.1
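
To make the change above concrete, here is a minimal standalone sketch contrasting the two
address calculations: the legacy CF8/CFC mechanism the patch removes and the ECAM MMIO
address it introduces. PCI_CFG_ENABLE is the usual bit-31 enable in the CF8 value, and the
mmcfg base is a made-up example standing in for the platform's pci_mmcfg_base:

#include <stdint.h>
#include <stdio.h>

#define PCI_CFG_ENABLE 0x80000000U

int main(void)
{
	uint16_t bdf = 0x00A3U;               /* bus 0, device 20, function 3 */
	uint32_t offset = 0x04U;              /* legacy mechanism only reaches offsets < 0x100 */
	uint64_t mmcfg_base = 0xE0000000UL;   /* assumed MMCONFIG window base */

	/* legacy: value written to I/O port 0xCF8, data accessed via 0xCFC */
	uint32_t cf8 = ((uint32_t)bdf << 8U) | offset | PCI_CFG_ENABLE;

	/* ECAM: a plain MMIO address inside the MMCONFIG window, 4KB per function */
	uint64_t ecam = mmcfg_base + (((uint64_t)bdf << 12U) | offset);

	printf("CF8 value 0x%x, ECAM address 0x%llx\n", cf8, (unsigned long long)ecam);
	return 0;
}

Besides avoiding the shared CF8/CFC port pair, the ECAM path can reach the whole 4KB PCIe
configuration space instead of only the first 256 bytes.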


[RFC PATCH v2 0/2] Add PCIe ECAM access support

Li, Fei1
 

v2:
Remove PCIe ECAM access emulation for pre-launched VMs since they are not meant to support PCIe ECAM access.


v1:
Now the ACRN HV traps PCI mmconfig access only for pre-launched VMs and the SOS,
emulates access to the first 256 bytes of the configuration space, and passes
through access to the PCIe extended configuration space.

Li Fei1 (2):
hv: pci: use ECAM to access PCIe Configuration Space
hv: vpci: trap PCIe ECAM access for SOS

hypervisor/dm/vpci/vpci.c | 43 ++++++++++++++++++++++++++++++-
hypervisor/hw/pci.c | 50 +++++++++++++++++++++---------------
hypervisor/include/dm/vpci.h | 7 ++---
hypervisor/include/hw/pci.h | 5 +++-
4 files changed, 79 insertions(+), 26 deletions(-)

--
2.17.1


Re: [PATCH v2 8/9] hv: sched: suspend/resume scheduling in suspend-to-ram path

Shuo A Liu
 

On Fri 6.Dec'19 at 13:31:09 +0800, Dong, Eddie wrote:
Suspend/resume needs to be done for each AP...
BTW, if we don't do suspend/resume, it seems it should be fine as long as the timer is restored (the physical TSC is restored).
OK. Will drop this patch.


-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Shuo A Liu
Sent: Thursday, December 5, 2019 5:15 PM
To: acrn-dev@...
Cc: Wang, Yu1 <yu1.wang@...>; Chen, Jason CJ
<jason.cj.chen@...>; Chen, Conghui <conghui.chen@...>; Liu,
Shuo A <shuo.a.liu@...>
Subject: [acrn-dev] [PATCH v2 8/9] hv: sched: suspend/resume scheduling in
suspend-to-ram path

Signed-off-by: Shuo A Liu <shuo.a.liu@...>
---
hypervisor/arch/x86/pm.c | 2 ++
hypervisor/common/schedule.c | 18 ++++++++++++++++++
hypervisor/include/common/schedule.h | 2 ++
3 files changed, 22 insertions(+)

diff --git a/hypervisor/arch/x86/pm.c b/hypervisor/arch/x86/pm.c index
bbce3c8..07b712e 100644
--- a/hypervisor/arch/x86/pm.c
+++ b/hypervisor/arch/x86/pm.c
@@ -194,6 +194,7 @@ void host_enter_s3(const struct pm_s_state_data
*sstate_data, uint32_t pm1a_cnt_
CPU_IRQ_DISABLE();
vmx_off();

+ suspend_sched();
suspend_console();
suspend_ioapic();
suspend_iommu();
@@ -225,6 +226,7 @@ void host_enter_s3(const struct pm_s_state_data
*sstate_data, uint32_t pm1a_cnt_

/* console must be resumed after TSC restored since it will setup timer
base on TSC */
resume_console();
+ resume_sched();
}

void reset_host(void)
diff --git a/hypervisor/common/schedule.c b/hypervisor/common/schedule.c
index 681c17a..32a60bd 100644
--- a/hypervisor/common/schedule.c
+++ b/hypervisor/common/schedule.c
@@ -240,6 +240,24 @@ void kick_thread(const struct thread_object *obj)
release_schedule_lock(pcpu_id, rflag); }

+void suspend_sched(void)
+{
+ struct sched_control *ctl = &per_cpu(sched_ctl, get_pcpu_id());
+
+ if (ctl->scheduler->deinit != NULL) {
+ ctl->scheduler->deinit(ctl);
+ }
+}
+
+void resume_sched(void)
+{
+ struct sched_control *ctl = &per_cpu(sched_ctl, get_pcpu_id());
+
+ if (ctl->scheduler->init != NULL) {
+ ctl->scheduler->init(ctl);
+ }
+}
+
void yield(void)
{
make_reschedule_request(get_pcpu_id(), DEL_MODE_IPI); diff --git
a/hypervisor/include/common/schedule.h
b/hypervisor/include/common/schedule.h
index a237cdb..2753ef7 100644
--- a/hypervisor/include/common/schedule.h
+++ b/hypervisor/include/common/schedule.h
@@ -111,6 +111,8 @@ void wake_thread(struct thread_object *obj); void
kick_thread(const struct thread_object *obj); void yield(void); void
schedule(void);
+void suspend_sched(void);
+void resume_sched(void);

void arch_switch_to(void *prev_sp, void *next_sp); void
run_idle_thread(void);
--
2.8.3
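
To illustrate the "each AP" point above, a hypothetical per-pCPU variant could look roughly
like the sketch below. It reuses the sched_ctl per-cpu variable and the deinit hook from the
patch; get_pcpu_nums() and the exact iteration are assumptions for illustration only, since
the patch is being dropped anyway:

static void suspend_sched_all(void)
{
	uint16_t i;

	/* walk every pCPU instead of only the caller's sched_control */
	for (i = 0U; i < get_pcpu_nums(); i++) {
		struct sched_control *ctl = &per_cpu(sched_ctl, i);

		if (ctl->scheduler->deinit != NULL) {
			ctl->scheduler->deinit(ctl);
		}
	}
}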



Re: [PATCH v2 4/9] hv: sched_iorr: add some interfaces implementation of sched_iorr

Shuo A Liu
 

Hi Eddie,

On Fri 6.Dec'19 at 10:57:44 +0800, Dong, Eddie wrote:


-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Shuo A Liu
Sent: Thursday, December 5, 2019 5:15 PM
To: acrn-dev@...
Cc: Wang, Yu1 <yu1.wang@...>; Chen, Jason CJ
<jason.cj.chen@...>; Chen, Conghui <conghui.chen@...>; Liu,
Shuo A <shuo.a.liu@...>
Subject: [acrn-dev] [PATCH v2 4/9] hv: sched_iorr: add some interfaces
implementation of sched_iorr

Implement .sleep/.wake/.pick_next of sched_iorr.
In .pick_next, we count the current object's timeslice and pick the next available
one. The policy is
1) get the first item in the runqueue first
2) if the object picked has no time_cycles left, replenish it and pick this one
3) take the idle sched object if we still have no runnable object after
steps 1) and 2)
In .wake, we start the tick if we have more than one active thread_object in
runqueue. In .sleep, stop the tick timer if necessary.

Signed-off-by: Jason Chen CJ <jason.cj.chen@...>
Signed-off-by: Yu Wang <yu1.wang@...>
Signed-off-by: Shuo A Liu <shuo.a.liu@...>
---
hypervisor/common/sched_iorr.c | 47
++++++++++++++++++++++++++++++++++++++----
1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/hypervisor/common/sched_iorr.c
b/hypervisor/common/sched_iorr.c index d6d0ad9..6b12d01 100644
--- a/hypervisor/common/sched_iorr.c
+++ b/hypervisor/common/sched_iorr.c
@@ -121,17 +121,56 @@ void sched_iorr_init_data(struct thread_object
*obj)
data->left_cycles = data->slice_cycles = CONFIG_SLICE_MS *
CYCLES_PER_MS; }

-static struct thread_object *sched_iorr_pick_next(__unused struct
sched_control *ctl)
+static struct thread_object *sched_iorr_pick_next(struct sched_control
+*ctl)
{
- return NULL;
+ struct sched_iorr_control *iorr_ctl = (struct sched_iorr_control
*)ctl->priv;
+ struct thread_object *next = NULL;
+ struct thread_object *current = NULL;
+ struct sched_iorr_data *data;
+ uint64_t now = rdtsc();
+
+ current = ctl->curr_obj;
+ data = (struct sched_iorr_data *)current->data;
+ /* Ignore the idle object, inactive objects */
+ if (!is_idle_thread(current) && is_inqueue(current)) {
OK, we may have a different understanding of the idle thread... IMO, the idle thread is a normal thread, inside the runqueue, but with the least priority. It seems it is not in the runqueue in your proposal.
Yes. idle is not in runqueue.

What is the tradeoff here?
As the basic policy is round-robin, we put the runnable threads in the runqueue,
except idle, and pick them with the round-robin policy. It is straightforward.
And the idle thread is a per-cpu variable, so we can get it directly if the
runqueue is empty.


+ data->left_cycles -= now - data->last_cycles;
+ if (data->left_cycles <= 0) {
+ /* replenish thread_object with slice_cycles */
+ data->left_cycles += data->slice_cycles;
+ }
+ /* move the thread_object to tail */
+ runqueue_remove(current);
+ runqueue_add_tail(current);
+ }
+
+ /*
+ * Pick the next runnable sched object
+ * 1) get the first item in runqueue firstly
+ * 2) if object picked has no time_cycles, replenish it pick this one
+ * 3) At least take one idle sched object if we have no runnable one after
step 1) and 2)
+ */
+ if (!list_empty(&iorr_ctl->runqueue)) {
+ next = get_first_item(&iorr_ctl->runqueue, struct thread_object,
data);
+ data = (struct sched_iorr_data *)next->data;
+ data->last_cycles = now;
+ while (data->left_cycles <= 0) {
+ data->left_cycles += data->slice_cycles;
+ }
I can understand we may have to add one slice. But why do we add slices without limit until it becomes positive?
Yes. Because if we only add one slice, a thread with a negative slice on the
runqueue will only be replenished to positive after several ticks, and it would
request a reschedule every tick even if it is the only runnable thread on the
runqueue, because of the negative slice.

In my understanding, a thread runs its time slice all the way down to below 0. Once it becomes < 0, we should trigger a scheduling (and hence it is only a little below 0).
Yes, right. But in some cases, e.g. thread A has a < 0 time slice and thread B
yields the cpu, thread A gets to run again with a < 0 time slice. It will then
consume more timeslice and go a bit further below 0.


+ } else {
+ next = &get_cpu_var(idle);
+ }
+
+ return next;
}

-static void sched_iorr_sleep(__unused struct thread_object *obj)
+static void sched_iorr_sleep(struct thread_object *obj)
{
+ runqueue_remove(obj);
If thread A wants to sleep thread B, this API is fine.

If thread A wants to sleep itself, this API may be bogus. Right? We need to pick next and trigger a switch to next...
Yes. The sleep_thread function in the schedule framework does that. Here, it
just operates on its runqueue to make sure the next pick_next can get the
right thread to run.

}

-static void sched_iorr_wake(__unused struct thread_object *obj)
+static void sched_iorr_wake(struct thread_object *obj)
{
+ runqueue_add_head(obj);
}

struct acrn_scheduler sched_iorr = {
--
2.8.3
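
To make the replenishment discussion above concrete, here is a small standalone simulation
of the left_cycles bookkeeping in pick_next. The slice length and the amount consumed are
made-up numbers; the real code uses CONFIG_SLICE_MS * CYCLES_PER_MS and TSC deltas:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t slice_cycles = 10;   /* pretend one slice is 10 cycles */
	int64_t left_cycles = 10;

	/* the thread keeps getting to run (e.g. others yield), consuming 27 cycles
	 * in total, so it ends up well below zero, not just "a little below 0" */
	left_cycles -= 27;           /* now -17 */

	/* pick_next replenishes until the object has budget again */
	while (left_cycles <= 0) {
		left_cycles += slice_cycles;
	}

	printf("left_cycles after replenish: %lld\n", (long long)left_cycles);  /* 3 */
	return 0;
}

A single "+= slice_cycles" would leave the thread at -7 here, so it would keep requesting a
reschedule on every tick even when it is the only runnable thread, which is the case the
while loop is meant to cover.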



Re: [PATCH 0/7] Use NMI to notify vCPUs with lapic-pt

Grandhi, Sainath
 

-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Kaige Fu
Sent: Friday, December 06, 2019 5:48 AM
To: acrn-dev@...
Subject: [acrn-dev] [PATCH 0/7] Use NMI to notify vCPUs with lapic-pt

ACRN hypervisor needs to kick vCPU off VMX non-root mode to do some
operations in hypervisor, such as interrupt/exception injection, EPT flush etc. For
non lapic-pt vCPUs, we can use IPI to do so. But, it doesn't work for lapic-pt
vCPUs as the IPI will be injected to VMs directly without vmexit.

Consequently, fatal errors may be triggered. 1) Certain operations may not
be carried out on time, which may further lead to fatal errors. Taking the EPT
flush request as an example, once we don't flush the EPT on time and the guest
accesses the out-of-date EPT, a fatal error happens. 2) The IPI vector will be
delivered to the VM directly. If the guest can't handle it properly, further interrupts
might be blocked, which will cause the VM to hang.

The NMI can be used as the notification signal to kick the vCPU out of VMX non-
root mode for lapic-pt vCPUs. This patchset does it by enabling NMI-exiting after
passing the lapic through to the vCPU.

TODOs:
- Filter out all NMI sources:
* Write ICR with deliver mode NMI
* Program the MSI data with deliver mode NMI
* Program the LVTs with deliver mode NMI
- Implement the smp_call for lapic-pt VMs to facilitate the debug of lapic-pt
VMs.

Kaige Fu (7):
HV: Push NMI vector on to the exception stack
HV: Add helper function send_single_nmi
HV: Use NMI to kick lapic-pt vCPU's thread
HV: ignore the NMI injection request for lapic-pt vCPUs
HV: Use NMI-window exiting to address req missing issue
Hi Kaige,
This patch " Use NMI-window exiting to address req missing issue" has nothing to do LAPIC PT and "NMI being used for notification". Is that right?
HV: Use NMI to replace INIT signal for lapic-pt VMs S5
HV: Remove INIT signal notification related code

hypervisor/arch/x86/guest/virq.c | 76 ++++++++++++++++++++--------
hypervisor/arch/x86/guest/vmcs.c | 18 +++++--
hypervisor/arch/x86/guest/vmexit.c | 23 +--------
hypervisor/arch/x86/idt.S | 6 +--
hypervisor/arch/x86/irq.c | 55 ++++++++++++++++----
hypervisor/arch/x86/lapic.c | 9 +---
hypervisor/common/schedule.c | 24 ++++++---
hypervisor/include/arch/x86/irq.h | 1 +
hypervisor/include/arch/x86/lapic.h | 4 +-
hypervisor/include/common/schedule.h | 4 +-
10 files changed, 146 insertions(+), 74 deletions(-)

--
2.20.0



Re: [PATCH v2 3/9] hv: sched_iorr: add tick handler and runqueue operations

Shuo A Liu
 

On Thu 5.Dec'19 at 18:03:17 +0800, Dong, Eddie wrote:


-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Shuo A Liu
Sent: Thursday, December 5, 2019 5:15 PM
To: acrn-dev@...
Cc: Wang, Yu1 <yu1.wang@...>; Chen, Jason CJ
<jason.cj.chen@...>; Chen, Conghui <conghui.chen@...>; Liu,
Shuo A <shuo.a.liu@...>
Subject: [acrn-dev] [PATCH v2 3/9] hv: sched_iorr: add tick handler and
runqueue operations

sched_control is per-pcpu; each sched_control has a tick timer running
periodically. Every period is called a tick. In the tick handler, we
1) compute the left timeslice of the current thread_object if it's not the idle one
2) make a reschedule request if the current thread_object has run out of its timeslice

For runqueue maintenance, we keep objects that still have timeslice at the
front of the runqueue and the ones that just got replenished at the tail.

Signed-off-by: Jason Chen CJ <jason.cj.chen@...>
Signed-off-by: Yu Wang <yu1.wang@...>
Signed-off-by: Shuo A Liu <shuo.a.liu@...>
---
hypervisor/common/sched_iorr.c | 61
+++++++++++++++++++++++++++++++++++++++++-
1 file changed, 60 insertions(+), 1 deletion(-)

diff --git a/hypervisor/common/sched_iorr.c
b/hypervisor/common/sched_iorr.c index d0b5ce7..d6d0ad9 100644
--- a/hypervisor/common/sched_iorr.c
+++ b/hypervisor/common/sched_iorr.c
@@ -18,8 +18,67 @@ struct sched_iorr_data {
int64_t left_cycles;
};

-static void sched_tick_handler(__unused void *param)
+bool is_inqueue(struct thread_object *obj)
{
+ struct sched_iorr_data *data = (struct sched_iorr_data *)obj->data;
+ return !list_empty(&data->list);
+}
+
+void runqueue_add_head(struct thread_object *obj) {
+ struct sched_iorr_control *iorr_ctl = (struct sched_iorr_control
*)obj->sched_ctl->priv;
+ struct sched_iorr_data *data = (struct sched_iorr_data *)obj->data;
+
+ if (!is_inqueue(obj)) {
+ list_add(&data->list, &iorr_ctl->runqueue);
+ }
+}
+
+void runqueue_add_tail(struct thread_object *obj) {
Here we assume obj is already initialized with proper obj->sched_ctrl, and priv...
Add @pre ?
OK. Will add

@pre obj != NULL
@pre obj->sched_ctl != NULL
@pre obj->sched_ctl->priv != NULL


Same for other APIs?
Yes. Will revisit them and add the @pre.



+ struct sched_iorr_control *iorr_ctl = (struct sched_iorr_control
*)obj->sched_ctl->priv;
+ struct sched_iorr_data *data = (struct sched_iorr_data *)obj->data;
+
+ if (!is_inqueue(obj)) {
+ list_add_tail(&data->list, &iorr_ctl->runqueue);
+ }
+}
+
+void runqueue_remove(struct thread_object *obj) {
+ struct sched_iorr_data *data = (struct sched_iorr_data *)obj->data;
+ list_del_init(&data->list);
+}
+
+static void sched_tick_handler(void *param) {
+ struct sched_control *ctl = (struct sched_control *)param;
+ struct sched_iorr_control *iorr_ctl = (struct sched_iorr_control
*)ctl->priv;
+ struct sched_iorr_data *data;
+ struct thread_object *current;
+ uint16_t pcpu_id = get_pcpu_id();
+ uint64_t now = rdtsc();
+ uint64_t rflags;
+
+ obtain_schedule_lock(pcpu_id, &rflags);
+ current = ctl->curr_obj;
+
+ /* If no vCPU start scheduling, ignore this tick */
+ if (current == NULL || (is_idle_thread(current) &&
list_empty(&iorr_ctl->runqueue))) {
In which case, current may be NULL ? Can we assume we always have idle thread?
In the early stage, the timer is started before we have launched the idle
thread, so it is NULL. We need to check it for such a case.


+ release_schedule_lock(pcpu_id, rflags);
+ return;
Does MISRAC allow return here?
Will refine the code to satisfy MISRAC.


+ }
+ data = (struct sched_iorr_data *)current->data;
+ /* consume the left_cycles of current thread_object if it is not idle */
+ if (!is_idle_thread(current)) {
+ data->left_cycles -= now - data->last_cycles;
+ data->last_cycles = now;
+ }
+ /* make reschedule request if current ran out of its cycles */
+ if (is_idle_thread(current) || data->left_cycles <= 0) {
+ make_reschedule_request(pcpu_id, DEL_MODE_IPI);
+ }
+ release_schedule_lock(pcpu_id, rflags);
}

/*
--
2.8.3



[PATCH 7/7] HV: Remove INIT signal notification related code

Kaige Fu
 

We don't use the INIT signal notification method now. This patch
removes the related code.

Signed-off-by: Kaige Fu <kaige.fu@...>
---
hypervisor/arch/x86/guest/vmexit.c | 21 +--------------------
hypervisor/arch/x86/lapic.c | 20 --------------------
hypervisor/common/schedule.c | 5 +----
hypervisor/include/arch/x86/lapic.h | 9 ---------
hypervisor/include/common/schedule.h | 4 +---
5 files changed, 3 insertions(+), 56 deletions(-)

diff --git a/hypervisor/arch/x86/guest/vmexit.c b/hypervisor/arch/x86/guest/vmexit.c
index 459cc430..6e6ec84c 100644
--- a/hypervisor/arch/x86/guest/vmexit.c
+++ b/hypervisor/arch/x86/guest/vmexit.c
@@ -30,7 +30,6 @@ static int32_t unhandled_vmexit_handler(struct acrn_vcpu *vcpu);
static int32_t xsetbv_vmexit_handler(struct acrn_vcpu *vcpu);
static int32_t wbinvd_vmexit_handler(struct acrn_vcpu *vcpu);
static int32_t undefined_vmexit_handler(struct acrn_vcpu *vcpu);
-static int32_t init_signal_vmexit_handler(__unused struct acrn_vcpu *vcpu);

/* VM Dispatch table for Exit condition handling */
static const struct vm_exit_dispatch dispatch_table[NR_VMX_EXIT_REASONS] = {
@@ -41,7 +40,7 @@ static const struct vm_exit_dispatch dispatch_table[NR_VMX_EXIT_REASONS] = {
[VMX_EXIT_REASON_TRIPLE_FAULT] = {
.handler = triple_fault_vmexit_handler},
[VMX_EXIT_REASON_INIT_SIGNAL] = {
- .handler = init_signal_vmexit_handler},
+ .handler = undefined_vmexit_handler},
[VMX_EXIT_REASON_STARTUP_IPI] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_IO_SMI] = {
@@ -373,21 +372,3 @@ static int32_t undefined_vmexit_handler(struct acrn_vcpu *vcpu)
vcpu_inject_ud(vcpu);
return 0;
}
-
-/*
- * This handler is only triggered by INIT signal when poweroff from inside of RTVM
- */
-static int32_t init_signal_vmexit_handler(__unused struct acrn_vcpu *vcpu)
-{
- /*
- * Intel SDM Volume 3, 25.2:
- * INIT signals. INIT signals cause VM exits. A logical processer performs none
- * of the operations normally associated with these events. Such exits do not modify
- * register state or clear pending events as they would outside of VMX operation (If
- * a logical processor is the wait-for-SIPI state, INIT signals are blocked. They do
- * not cause VM exits in this case).
- *
- * So, it is safe to ignore the signal and reture here.
- */
- return 0;
-}
diff --git a/hypervisor/arch/x86/lapic.c b/hypervisor/arch/x86/lapic.c
index a54cb067..498a7972 100644
--- a/hypervisor/arch/x86/lapic.c
+++ b/hypervisor/arch/x86/lapic.c
@@ -269,26 +269,6 @@ void send_single_ipi(uint16_t pcpu_id, uint32_t vector)
}
}

-/**
- * @pre pcpu_id < CONFIG_MAX_PCPU_NUM
- *
- * @return None
- */
-void send_single_init(uint16_t pcpu_id)
-{
- union apic_icr icr;
-
- /*
- * Intel SDM Vol3 23.8:
- * The INIT signal is blocked whenever a logical processor is in VMX root operation.
- * It is not blocked in VMX nonroot operation. Instead, INITs cause VM exits
- */
- icr.value_32.hi_32 = per_cpu(lapic_id, pcpu_id);
- icr.value_32.lo_32 = (INTR_LAPIC_ICR_PHYSICAL << 11U) | (INTR_LAPIC_ICR_INIT << 8U);
-
- msr_write(MSR_IA32_EXT_APIC_ICR, icr.value);
-}
-
/**
* @pre pcpu_id < CONFIG_MAX_PCPU_NUM
*
diff --git a/hypervisor/common/schedule.c b/hypervisor/common/schedule.c
index b846863d..e5714c36 100644
--- a/hypervisor/common/schedule.c
+++ b/hypervisor/common/schedule.c
@@ -118,7 +118,7 @@ struct thread_object *sched_get_current(uint16_t pcpu_id)
}

/**
- * @pre delmode == DEL_MODE_IPI || delmode == DEL_MODE_INIT || delmode == DEL_MODE_NMI
+ * @pre delmode == DEL_MODE_IPI || delmode == DEL_MODE_NMI
*/
void make_reschedule_request(uint16_t pcpu_id, uint16_t delmode)
{
@@ -130,9 +130,6 @@ void make_reschedule_request(uint16_t pcpu_id, uint16_t delmode)
case DEL_MODE_IPI:
send_single_ipi(pcpu_id, VECTOR_NOTIFY_VCPU);
break;
- case DEL_MODE_INIT:
- send_single_init(pcpu_id);
- break;
case DEL_MODE_NMI:
send_single_nmi(pcpu_id);
break;
diff --git a/hypervisor/include/arch/x86/lapic.h b/hypervisor/include/arch/x86/lapic.h
index 9b59c21d..b345ff6b 100644
--- a/hypervisor/include/arch/x86/lapic.h
+++ b/hypervisor/include/arch/x86/lapic.h
@@ -174,15 +174,6 @@ void send_single_ipi(uint16_t pcpu_id, uint32_t vector);
*/
/* End of ipi_ext_apis */

-/**
- * @brief Send an INIT signal to a single pCPU
- *
- * @param[in] pcpu_id The id of destination physical cpu
- *
- * @return None
- */
-void send_single_init(uint16_t pcpu_id);
-
/**
* @brief Send an NMI signal to a single pCPU
*
diff --git a/hypervisor/include/common/schedule.h b/hypervisor/include/common/schedule.h
index 27aa4db7..473aff56 100644
--- a/hypervisor/include/common/schedule.h
+++ b/hypervisor/include/common/schedule.h
@@ -10,9 +10,8 @@

#define NEED_RESCHEDULE (1U)

-#define DEL_MODE_INIT (1U)
+#define DEL_MODE_NMI (1U)
#define DEL_MODE_IPI (2U)
-#define DEL_MODE_NMI (3U)

#define THREAD_DATA_SIZE (256U)

@@ -23,7 +22,6 @@ enum thread_object_state {
};

enum sched_notify_mode {
- SCHED_NOTIFY_INIT,
SCHED_NOTIFY_NMI,
SCHED_NOTIFY_IPI
};
--
2.20.0


[PATCH 6/7] HV: Use NMI to replace INIT signal for lapic-pt VMs S5

Kaige Fu
 

We have implemented a new notification method using NMI.
So replace the INIT notification method with the NMI one.
Then we can remove INIT notification related code later.

Signed-off-by: Kaige Fu <kaige.fu@...>
---
hypervisor/arch/x86/guest/vmcs.c | 7 ++++---
hypervisor/common/schedule.c | 4 ++--
hypervisor/include/common/schedule.h | 1 +
3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/hypervisor/arch/x86/guest/vmcs.c b/hypervisor/arch/x86/guest/vmcs.c
index adf8d7cd..143a53ea 100644
--- a/hypervisor/arch/x86/guest/vmcs.c
+++ b/hypervisor/arch/x86/guest/vmcs.c
@@ -612,10 +612,11 @@ void switch_apicv_mode_x2apic(struct acrn_vcpu *vcpu)
update_msr_bitmap_x2apic_passthru(vcpu);

/*
- * After passthroughing lapic to guest, we should use INIT signal to
- * notify vcpu thread instead of IPI
+ * After passthroughing lapic to guest, we should use NMI signal to
+ * notify vcpu thread instead of IPI. Because the IPI will be delivered
+ * to the guest directly without vmexit.
*/
- vcpu->thread_obj.notify_mode = SCHED_NOTIFY_INIT;
+ vcpu->thread_obj.notify_mode = SCHED_NOTIFY_NMI;
} else {
value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS2);
value32 &= ~VMX_PROCBASED_CTLS2_VAPIC;
diff --git a/hypervisor/common/schedule.c b/hypervisor/common/schedule.c
index 327d4cc6..b846863d 100644
--- a/hypervisor/common/schedule.c
+++ b/hypervisor/common/schedule.c
@@ -197,8 +197,8 @@ void sleep_thread(struct thread_object *obj)
scheduler->sleep(obj);
}
if (is_running(obj)) {
- if (obj->notify_mode == SCHED_NOTIFY_INIT) {
- make_reschedule_request(pcpu_id, DEL_MODE_INIT);
+ if (obj->notify_mode == SCHED_NOTIFY_NMI) {
+ make_reschedule_request(pcpu_id, DEL_MODE_NMI);
} else {
make_reschedule_request(pcpu_id, DEL_MODE_IPI);
}
diff --git a/hypervisor/include/common/schedule.h b/hypervisor/include/common/schedule.h
index 0a407fb1..27aa4db7 100644
--- a/hypervisor/include/common/schedule.h
+++ b/hypervisor/include/common/schedule.h
@@ -24,6 +24,7 @@ enum thread_object_state {

enum sched_notify_mode {
SCHED_NOTIFY_INIT,
+ SCHED_NOTIFY_NMI,
SCHED_NOTIFY_IPI
};

--
2.20.0


[PATCH 5/7] HV: Use NMI-window exiting to address req missing issue

Kaige Fu
 

There is a window where we may miss the current request in the
notification period when the workflow is as follows:

CPUx + + CPUr
| |
| +--+
| | | Handle pending req
| <--+
+--+ |
| | Set req flag |
<--+ |
+------------------>---+
| Send NMI | | Handle NMI
| <--+
| |
| |
| +--> vCPU enter
| |
+ +

So, this patch enables NMI-window exiting to trigger the next vmexit
once there is no "virtual-NMI blocking" after the vCPU enters VMX non-root
mode. Then we can process the pending request on time.

Signed-off-by: Kaige Fu <kaige.fu@...>
---
hypervisor/arch/x86/guest/virq.c | 21 ++++++++++++++++++
hypervisor/arch/x86/guest/vmexit.c | 2 +-
hypervisor/arch/x86/irq.c | 35 +++++++++++++++++++++++++++++-
hypervisor/include/arch/x86/irq.h | 1 +
4 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/hypervisor/arch/x86/guest/virq.c b/hypervisor/arch/x86/guest/virq.c
index 1c9904c4..784e119b 100644
--- a/hypervisor/arch/x86/guest/virq.c
+++ b/hypervisor/arch/x86/guest/virq.c
@@ -527,3 +527,24 @@ int32_t exception_vmexit_handler(struct acrn_vcpu *vcpu)

return status;
}
+
+int32_t nmi_window_vmexit_handler(struct acrn_vcpu *vcpu)
+{
+ uint32_t value32;
+
+ /*
+ * Disable NMI-window exiting here. We will process
+ * the pending request in acrn_handle_pending_request later
+ */
+ value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS);
+ value32 &= ~VMX_PROCBASED_CTLS_NMI_WINEXIT;
+ exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS, value32);
+
+ value32 = exec_vmread32(VMX_PIN_VM_EXEC_CONTROLS);
+ value32 &= ~VMX_PINBASED_CTLS_VIRT_NMI;
+ exec_vmwrite32(VMX_PIN_VM_EXEC_CONTROLS, value32);
+
+ vcpu_retain_rip(vcpu);
+
+ return 0;
+}
diff --git a/hypervisor/arch/x86/guest/vmexit.c b/hypervisor/arch/x86/guest/vmexit.c
index efc59e67..459cc430 100644
--- a/hypervisor/arch/x86/guest/vmexit.c
+++ b/hypervisor/arch/x86/guest/vmexit.c
@@ -51,7 +51,7 @@ static const struct vm_exit_dispatch dispatch_table[NR_VMX_EXIT_REASONS] = {
[VMX_EXIT_REASON_INTERRUPT_WINDOW] = {
.handler = interrupt_window_vmexit_handler},
[VMX_EXIT_REASON_NMI_WINDOW] = {
- .handler = unhandled_vmexit_handler},
+ .handler = nmi_window_vmexit_handler},
[VMX_EXIT_REASON_TASK_SWITCH] = {
.handler = unhandled_vmexit_handler},
[VMX_EXIT_REASON_CPUID] = {
diff --git a/hypervisor/arch/x86/irq.c b/hypervisor/arch/x86/irq.c
index 4460b44d..067ba706 100644
--- a/hypervisor/arch/x86/irq.c
+++ b/hypervisor/arch/x86/irq.c
@@ -18,6 +18,7 @@
#include <vboot.h>
#include <dump.h>
#include <logmsg.h>
+#include <vmx.h>

static spinlock_t exception_spinlock = { .head = 0U, .tail = 0U, };
static spinlock_t irq_alloc_spinlock = { .head = 0U, .tail = 0U, };
@@ -368,9 +369,41 @@ void dispatch_interrupt(const struct intr_excp_ctx *ctx)
void dispatch_exception(struct intr_excp_ctx *ctx)
{
uint16_t pcpu_id = get_pcpu_id();
+ uint32_t value32;

if (ctx->vector == IDT_NMI) {
- /* TODO: we will handle it later */
+ /*
+ * There is a window where we may miss the current request in this
+ * notification period when the work flow is as the following:
+ *
+ * CPUx + + CPUr
+ * | |
+ * | +--+
+ * | | | Handle pending req
+ * | <--+
+ * +--+ |
+ * | | Set req flag |
+ * <--+ |
+ * +------------------>---+
+ * | Send NMI | | Handle NMI
+ * | <--+
+ * | |
+ * | |
+ * | +--> vCPU enter
+ * | |
+ * + +
+ *
+ * So, here we enable the NMI-window exiting to trigger the next vmexit
+ * once there is no "virtual-NMI blocking" after the vCPU enters VMX non-root
+ * mode. Then we can process the pending request on time.
+ */
+ value32 = exec_vmread32(VMX_PROC_VM_EXEC_CONTROLS);
+ value32 |= VMX_PROCBASED_CTLS_NMI_WINEXIT;
+ exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS, value32);
+
+ value32 = exec_vmread32(VMX_PIN_VM_EXEC_CONTROLS);
+ value32 |= VMX_PINBASED_CTLS_VIRT_NMI;
+ exec_vmwrite32(VMX_PIN_VM_EXEC_CONTROLS, value32);
} else {
/* Obtain lock to ensure exception dump doesn't get corrupted */
spinlock_obtain(&exception_spinlock);
diff --git a/hypervisor/include/arch/x86/irq.h b/hypervisor/include/arch/x86/irq.h
index 8f8f3967..a5ed3aed 100644
--- a/hypervisor/include/arch/x86/irq.h
+++ b/hypervisor/include/arch/x86/irq.h
@@ -207,6 +207,7 @@ void vcpu_make_request(struct acrn_vcpu *vcpu, uint16_t eventid);
* @pre vcpu != NULL
*/
int32_t exception_vmexit_handler(struct acrn_vcpu *vcpu);
+int32_t nmi_window_vmexit_handler(struct acrn_vcpu *vcpu);
int32_t interrupt_window_vmexit_handler(struct acrn_vcpu *vcpu);
int32_t external_interrupt_vmexit_handler(struct acrn_vcpu *vcpu);
int32_t acrn_handle_pending_request(struct acrn_vcpu *vcpu);
--
2.20.0


[PATCH 4/7] HV: ignore the NMI injection request for lapic-pt vCPUs

Kaige Fu
 

NMI will be used as the notification signal for lapic-pt vCPUs, and
we don't support vNMI yet. So, this patch ignores the pending NMI
request and the exception (EXCP) with vector 2.

Signed-off-by: Kaige Fu <kaige.fu@...>
---
hypervisor/arch/x86/guest/virq.c | 55 ++++++++++++++++++++------------
1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/hypervisor/arch/x86/guest/virq.c b/hypervisor/arch/x86/guest/virq.c
index 60358534..1c9904c4 100644
--- a/hypervisor/arch/x86/guest/virq.c
+++ b/hypervisor/arch/x86/guest/virq.c
@@ -211,26 +211,36 @@ int32_t vcpu_queue_exception(struct acrn_vcpu *vcpu, uint32_t vector_arg, uint32
static void vcpu_inject_exception(struct acrn_vcpu *vcpu, uint32_t vector)
{
if (bitmap_test_and_clear_lock(ACRN_REQUEST_EXCP, &vcpu->arch.pending_req)) {
-
- if ((exception_type[vector] & EXCEPTION_ERROR_CODE_VALID) != 0U) {
- exec_vmwrite32(VMX_ENTRY_EXCEPTION_ERROR_CODE,
- vcpu->arch.exception_info.error);
- }
+ if (is_lapic_pt_enabled(vcpu) && (vector == IDT_NMI)) {
+ /*
+ * NMI will be used as notification signal for lapic-pt vCPUs and we
+ * don't support vNMI yet. So, here we just ignore the NMI injection
+ * request.
+ */
+ pr_warn("Don't allow to inject NMI to lapic-pt vCPU%u. Ignore this request.", vcpu->vcpu_id);
+ vcpu->arch.exception_info.exception = VECTOR_INVALID;
+ } else {
+ if ((exception_type[vector] & EXCEPTION_ERROR_CODE_VALID) != 0U) {
+ exec_vmwrite32(VMX_ENTRY_EXCEPTION_ERROR_CODE,
+ vcpu->arch.exception_info.error);
+ }

- exec_vmwrite32(VMX_ENTRY_INT_INFO_FIELD, VMX_INT_INFO_VALID |
- (exception_type[vector] << 8U) | (vector & 0xFFU));
+ exec_vmwrite32(VMX_ENTRY_INT_INFO_FIELD, VMX_INT_INFO_VALID |
+ (exception_type[vector] << 8U) | (vector & 0xFFU));

- vcpu->arch.exception_info.exception = VECTOR_INVALID;
+ vcpu->arch.exception_info.exception = VECTOR_INVALID;

- /* retain rip for exception injection */
- vcpu_retain_rip(vcpu);
+ /* retain rip for exception injection */
+ vcpu_retain_rip(vcpu);

- /* SDM 17.3.1.1 For any fault-class exception except a debug exception generated in response to an
- * instruction breakpoint, the value pushed for RF is 1.
- * #DB is treated as Trap in get_exception_type, so RF will not be set for instruction breakpoint.
- */
- if (get_exception_type(vector) == EXCEPTION_FAULT) {
- vcpu_set_rflags(vcpu, vcpu_get_rflags(vcpu) | HV_ARCH_VCPU_RFLAGS_RF);
+ /* SDM 17.3.1.1 For any fault-class exception except a debug exception generated
+ * in response to an instruction breakpoint, the value pushed for RF is 1.
+ * #DB is treated as Trap in get_exception_type, so RF will not be set for
+ * instruction breakpoint.
+ */
+ if (get_exception_type(vector) == EXCEPTION_FAULT) {
+ vcpu_set_rflags(vcpu, vcpu_get_rflags(vcpu) | HV_ARCH_VCPU_RFLAGS_RF);
+ }
}
}
}
@@ -383,10 +393,15 @@ int32_t acrn_handle_pending_request(struct acrn_vcpu *vcpu)
if (!injected) {
/* inject NMI before maskable hardware interrupt */
if (bitmap_test_and_clear_lock(ACRN_REQUEST_NMI, pending_req_bits)) {
- /* Inject NMI vector = 2 */
- exec_vmwrite32(VMX_ENTRY_INT_INFO_FIELD,
- VMX_INT_INFO_VALID | (VMX_INT_TYPE_NMI << 8U) | IDT_NMI);
- injected = true;
+ if (!is_lapic_pt_enabled(vcpu)) {
+ /* Inject NMI vector = 2 */
+ exec_vmwrite32(VMX_ENTRY_INT_INFO_FIELD,
+ VMX_INT_INFO_VALID | (VMX_INT_TYPE_NMI << 8U) | IDT_NMI);
+ injected = true;
+ } else {
+ pr_warn("Don't allow to inject NMI to lapic-pt vCPU%u. Ignore this request.",
+ vcpu->vcpu_id);
+ }
} else {
/* handling pending vector injection:
* there are many reason inject failed, we need re-inject again
--
2.20.0


[PATCH 3/7] HV: Use NMI to kick lapic-pt vCPU's thread

Kaige Fu
 

ACRN hypervisor needs to kick vCPU off VMX non-root mode to do some
operations in hypervisor, such as interrupt/exception injection, EPT
flush etc. For non lapic-pt vCPUs, we can use IPI to do so. But, it
doesn't work for lapic-pt vCPUs as the IPI will be injected to VMs
directly without vmexit.

There may be fatal errors triggered. 1) Certain operations may not be
carried out on time, which may further lead to fatal errors. Taking the
EPT flush request as an example, once we don't flush the EPT on time and
the guest accesses the out-of-date EPT, a fatal error happens. 2) The IPI
vector will be delivered to the VM directly. If the guest can't handle it
properly, further interrupts might be blocked, which will cause the VM to
hang.

The NMI can be used as the notification signal to kick the vCPU out of VMX
non-root mode for lapic-pt vCPUs. This patch does it by enabling NMI-exiting
after passing the lapic through to the vCPU.

Signed-off-by: Kaige Fu <kaige.fu@...>
---
hypervisor/arch/x86/guest/vmcs.c | 11 +++++++++++
hypervisor/arch/x86/irq.c | 20 ++++++++++++--------
hypervisor/common/schedule.c | 19 ++++++++++++++++---
hypervisor/include/common/schedule.h | 1 +
4 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/hypervisor/arch/x86/guest/vmcs.c b/hypervisor/arch/x86/guest/vmcs.c
index 2bad083e..adf8d7cd 100644
--- a/hypervisor/arch/x86/guest/vmcs.c
+++ b/hypervisor/arch/x86/guest/vmcs.c
@@ -568,6 +568,7 @@ void switch_apicv_mode_x2apic(struct acrn_vcpu *vcpu)
* Disable posted interrupt processing
* update x2apic msr bitmap for pass-thru
* enable inteception only for ICR
+ * enable NMI exit as we will use NMI to kick vCPU thread
* disable pre-emption for TSC DEADLINE MSR
* Disable Register Virtualization and virtual interrupt delivery
* Disable "use TPR shadow"
@@ -578,6 +579,16 @@ void switch_apicv_mode_x2apic(struct acrn_vcpu *vcpu)
if (is_apicv_advanced_feature_supported()) {
value32 &= ~VMX_PINBASED_CTLS_POST_IRQ;
}
+
+ /*
+ * ACRN hypervisor needs to kick vCPU off VMX non-root mode to do some
+ * operations in hypervisor, such as interrupt/exception injection, EPT
+ * flush etc. For non lapic-pt vCPUs, we can use IPI to do so. But, it
+ * doesn't work for lapic-pt vCPUs as the IPI will be injected to VMs
+ * directly without vmexit. So, here we enable NMI-exiting and use NMI
+ * as notification signal after passthroughing the lapic to vCPU.
+ */
+ value32 |= VMX_PINBASED_CTLS_NMI_EXIT;
exec_vmwrite32(VMX_PIN_VM_EXEC_CONTROLS, value32);

value32 = exec_vmread32(VMX_EXIT_CONTROLS);
diff --git a/hypervisor/arch/x86/irq.c b/hypervisor/arch/x86/irq.c
index 6a55418c..4460b44d 100644
--- a/hypervisor/arch/x86/irq.c
+++ b/hypervisor/arch/x86/irq.c
@@ -369,17 +369,21 @@ void dispatch_exception(struct intr_excp_ctx *ctx)
{
uint16_t pcpu_id = get_pcpu_id();

- /* Obtain lock to ensure exception dump doesn't get corrupted */
- spinlock_obtain(&exception_spinlock);
+ if (ctx->vector == IDT_NMI) {
+ /* TODO: we will handle it later */
+ } else {
+ /* Obtain lock to ensure exception dump doesn't get corrupted */
+ spinlock_obtain(&exception_spinlock);

- /* Dump exception context */
- dump_exception(ctx, pcpu_id);
+ /* Dump exception context */
+ dump_exception(ctx, pcpu_id);

- /* Release lock to let other CPUs handle exception */
- spinlock_release(&exception_spinlock);
+ /* Release lock to let other CPUs handle exception */
+ spinlock_release(&exception_spinlock);

- /* Halt the CPU */
- cpu_dead();
+ /* Halt the CPU */
+ cpu_dead();
+ }
}

static void init_irq_descs(void)
diff --git a/hypervisor/common/schedule.c b/hypervisor/common/schedule.c
index 847bafd0..327d4cc6 100644
--- a/hypervisor/common/schedule.c
+++ b/hypervisor/common/schedule.c
@@ -118,7 +118,7 @@ struct thread_object *sched_get_current(uint16_t pcpu_id)
}

/**
- * @pre delmode == DEL_MODE_IPI || delmode == DEL_MODE_INIT
+ * @pre delmode == DEL_MODE_IPI || delmode == DEL_MODE_INIT || delmode == DEL_MODE_NMI
*/
void make_reschedule_request(uint16_t pcpu_id, uint16_t delmode)
{
@@ -133,6 +133,9 @@ void make_reschedule_request(uint16_t pcpu_id, uint16_t delmode)
case DEL_MODE_INIT:
send_single_init(pcpu_id);
break;
+ case DEL_MODE_NMI:
+ send_single_nmi(pcpu_id);
+ break;
default:
ASSERT(false, "Unknown delivery mode %u for pCPU%u", delmode, pcpu_id);
break;
@@ -230,10 +233,20 @@ void kick_thread(const struct thread_object *obj)
obtain_schedule_lock(pcpu_id, &rflag);
if (is_running(obj)) {
if (get_pcpu_id() != pcpu_id) {
- send_single_ipi(pcpu_id, VECTOR_NOTIFY_VCPU);
+ if (obj->notify_mode == SCHED_NOTIFY_IPI) {
+ send_single_ipi(pcpu_id, VECTOR_NOTIFY_VCPU);
+ } else {
+ /* For lapic-pt vCPUs */
+ send_single_nmi(pcpu_id);
+ }
}
} else if (is_runnable(obj)) {
- make_reschedule_request(pcpu_id, DEL_MODE_IPI);
+ if (obj->notify_mode == SCHED_NOTIFY_IPI) {
+ make_reschedule_request(pcpu_id, DEL_MODE_IPI);
+ } else {
+ /* For lapic-pt vCPUs */
+ make_reschedule_request(pcpu_id, DEL_MODE_NMI);
+ }
} else {
/* do nothing */
}
diff --git a/hypervisor/include/common/schedule.h b/hypervisor/include/common/schedule.h
index 808beacc..0a407fb1 100644
--- a/hypervisor/include/common/schedule.h
+++ b/hypervisor/include/common/schedule.h
@@ -12,6 +12,7 @@

#define DEL_MODE_INIT (1U)
#define DEL_MODE_IPI (2U)
+#define DEL_MODE_NMI (3U)

#define THREAD_DATA_SIZE (256U)

--
2.20.0


[PATCH 2/7] HV: Add helper function send_single_nmi

Kaige Fu
 

This patch adds a helper function, send_single_nmi. The first caller
will come with a following patch.

Signed-off-by: Kaige Fu <kaige.fu@...>
---
hypervisor/arch/x86/lapic.c | 15 +++++++++++++++
hypervisor/include/arch/x86/lapic.h | 9 +++++++++
2 files changed, 24 insertions(+)

diff --git a/hypervisor/arch/x86/lapic.c b/hypervisor/arch/x86/lapic.c
index c06daaab..a54cb067 100644
--- a/hypervisor/arch/x86/lapic.c
+++ b/hypervisor/arch/x86/lapic.c
@@ -288,3 +288,18 @@ void send_single_init(uint16_t pcpu_id)

msr_write(MSR_IA32_EXT_APIC_ICR, icr.value);
}
+
+/**
+ * @pre pcpu_id < CONFIG_MAX_PCPU_NUM
+ *
+ * @return None
+ */
+void send_single_nmi(uint16_t pcpu_id)
+{
+ union apic_icr icr;
+
+ icr.value_32.hi_32 = per_cpu(lapic_id, pcpu_id);
+ icr.value_32.lo_32 = (INTR_LAPIC_ICR_PHYSICAL << 11U) | (INTR_LAPIC_ICR_NMI << 8U);
+
+ msr_write(MSR_IA32_EXT_APIC_ICR, icr.value);
+}
diff --git a/hypervisor/include/arch/x86/lapic.h b/hypervisor/include/arch/x86/lapic.h
index 5e490b8c..9b59c21d 100644
--- a/hypervisor/include/arch/x86/lapic.h
+++ b/hypervisor/include/arch/x86/lapic.h
@@ -183,4 +183,13 @@ void send_single_ipi(uint16_t pcpu_id, uint32_t vector);
*/
void send_single_init(uint16_t pcpu_id);

+/**
+ * @brief Send an NMI signal to a single pCPU
+ *
+ * @param[in] pcpu_id The id of destination physical cpu
+ *
+ * @return None
+ */
+void send_single_nmi(uint16_t pcpu_id);
+
#endif /* INTR_LAPIC_H */
--
2.20.0
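
For reference, a standalone sketch of the ICR value send_single_nmi composes, assuming the
standard x2APIC ICR layout (destination id in bits 63:32, delivery mode NMI = 100b in
bits 10:8, physical destination mode). The destination APIC id is a made-up example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t dest_lapic_id = 6U;   /* example destination x2APIC id */

	/* delivery mode NMI (100b) at bits 10:8; physical destination mode keeps
	 * bit 11 clear; the vector field (bits 7:0) is ignored for NMI delivery */
	uint64_t icr = ((uint64_t)dest_lapic_id << 32U) | (0x4UL << 8U);

	printf("ICR value: 0x%llx\n", (unsigned long long)icr);   /* 0x600000400 */
	return 0;
}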


[PATCH 1/7] HV: Push NMI vector on to the exception stack

Kaige Fu
 

This patch pushes the NMI vector (2) onto the exception stack so that
we can get the right vector in dispatch_exception.

Signed-off-by: Kaige Fu <kaige.fu@...>
---
hypervisor/arch/x86/idt.S | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hypervisor/arch/x86/idt.S b/hypervisor/arch/x86/idt.S
index cb42ea9e..bd6d7c47 100644
--- a/hypervisor/arch/x86/idt.S
+++ b/hypervisor/arch/x86/idt.S
@@ -110,9 +110,9 @@ excp_debug:

.align 8
excp_nmi:
-
-
-
+ pushq $0x0
+ pushq $0x02 /* pseudo error code */
+ jmp excp_save_frame

.align 8
excp_breakpoint:
--
2.20.0


[PATCH 0/7] Use NMI to notify vCPUs with lapic-pt

Kaige Fu
 

ACRN hypervisor needs to kick vCPU off VMX non-root mode to do some
operations in hypervisor, such as interrupt/exception injection, EPT
flush etc. For non lapic-pt vCPUs, we can use IPI to do so. But, it
doesn't work for lapic-pt vCPUs as the IPI will be injected to VMs
directly without vmexit.

Consequently, fatal errors may be triggered. 1) Certain operations may not be
carried out on time, which may further lead to fatal errors. Taking the
EPT flush request as an example, once we don't flush the EPT on time and
the guest accesses the out-of-date EPT, a fatal error happens. 2) The IPI
vector will be delivered to the VM directly. If the guest can't handle it
properly, further interrupts might be blocked, which will cause the VM to
hang.

The NMI can be used as the notification signal to kick the vCPU out of VMX
non-root mode for lapic-pt vCPUs. This patchset does it by enabling NMI-exiting
after passing the lapic through to the vCPU.

TODOs:
- Filter out all NMI sources:
* Write ICR with deliver mode NMI
* Program the MSI data with deliver mode NMI
* Program the LVTs with deliver mode NMI
- Implement the smp_call for lapic-pt VMs to facilitate the debug of lapic-pt VMs.

Kaige Fu (7):
HV: Push NMI vector on to the exception stack
HV: Add helper function send_single_nmi
HV: Use NMI to kick lapic-pt vCPU's thread
HV: ignore the NMI injection request for lapic-pt vCPUs
HV: Use NMI-window exiting to address req missing issue
HV: Use NMI to replace INIT signal for lapic-pt VMs S5
HV: Remove INIT signal notification related code

hypervisor/arch/x86/guest/virq.c | 76 ++++++++++++++++++++--------
hypervisor/arch/x86/guest/vmcs.c | 18 +++++--
hypervisor/arch/x86/guest/vmexit.c | 23 +--------
hypervisor/arch/x86/idt.S | 6 +--
hypervisor/arch/x86/irq.c | 55 ++++++++++++++++----
hypervisor/arch/x86/lapic.c | 9 +---
hypervisor/common/schedule.c | 24 ++++++---
hypervisor/include/arch/x86/irq.h | 1 +
hypervisor/include/arch/x86/lapic.h | 4 +-
hypervisor/include/common/schedule.h | 4 +-
10 files changed, 146 insertions(+), 74 deletions(-)

--
2.20.0


Re: [PATCH v2 0/9] Add one scheduler which supports cpu sharing: sched_iorr

Eddie Dong
 

Let us merge 1/2/3/4/5/8/9 first, after fixing the comments.
6/7 could be later.

-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Shuo A Liu
Sent: Thursday, December 5, 2019 5:15 PM
To: acrn-dev@...
Cc: Wang, Yu1 <yu1.wang@...>; Chen, Jason CJ
<jason.cj.chen@...>; Chen, Conghui <conghui.chen@...>; Liu,
Shuo A <shuo.a.liu@...>
Subject: [acrn-dev] [PATCH v2 0/9] Add one scheduler which supports cpu
sharing: sched_iorr

This patchset adds one scheduler named sched_iorr which supports cpu
sharing.
sched_iorr is short for IO-sensitive Round-Robin scheduler. It aims to schedule
threads with round-robin as the basic policy, enhanced with some fairness
mechanisms: preemption occurs on running out of timeslice, and a thread with a
pending IO request will be scheduled with high priority.

The patchset also adds PAUSE-loop exiting and HLT emulation support. The HLT
emulation is a simple version for improving cpu-sharing performance for now and
will be improved later.

v2: Add @pre and assert for sched_iorr_init

Shuo A Liu (9):
hv: sched_iorr: Add IO sensitive Round-robin scheduler
hv: sched_iorr: add init functions of sched_iorr
hv: sched_iorr: add tick handler and runqueue operations
hv: sched_iorr: add some interfaces implementation of sched_iorr
hv: sched: add yield support
hv: sched: PAUSE-loop exiting support in hypervisor
hv: sched: simple HLT emulation in hypervisor
hv: sched: suspend/resume scheduling in suspend-to-ram path
hv: sched: use hypervisor configuration to choose scheduler

hypervisor/Makefile | 5 +
hypervisor/arch/x86/Kconfig | 21 ++++
hypervisor/arch/x86/guest/vmcs.c | 9 +-
hypervisor/arch/x86/guest/vmexit.c | 19 +++-
hypervisor/arch/x86/pm.c | 2 +
hypervisor/common/sched_iorr.c | 179
++++++++++++++++++++++++++++++++++
hypervisor/common/schedule.c | 28 ++++++
hypervisor/include/arch/x86/per_cpu.h | 1 +
hypervisor/include/common/schedule.h | 11 +++
9 files changed, 271 insertions(+), 4 deletions(-) create mode 100644
hypervisor/common/sched_iorr.c

--
2.8.3



Re: [PATCH v2 8/9] hv: sched: suspend/resume scheduling in suspend-to-ram path

Eddie Dong
 

Suspend/resume needs to be done for each AP...
BTW, if we don't do suspend/resume, it seems it should be fine as long as the timer is restored (the physical TSC is restored).

-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Shuo A Liu
Sent: Thursday, December 5, 2019 5:15 PM
To: acrn-dev@...
Cc: Wang, Yu1 <yu1.wang@...>; Chen, Jason CJ
<jason.cj.chen@...>; Chen, Conghui <conghui.chen@...>; Liu,
Shuo A <shuo.a.liu@...>
Subject: [acrn-dev] [PATCH v2 8/9] hv: sched: suspend/resume scheduling in
suspend-to-ram path

Signed-off-by: Shuo A Liu <shuo.a.liu@...>
---
hypervisor/arch/x86/pm.c | 2 ++
hypervisor/common/schedule.c | 18 ++++++++++++++++++
hypervisor/include/common/schedule.h | 2 ++
3 files changed, 22 insertions(+)

diff --git a/hypervisor/arch/x86/pm.c b/hypervisor/arch/x86/pm.c index
bbce3c8..07b712e 100644
--- a/hypervisor/arch/x86/pm.c
+++ b/hypervisor/arch/x86/pm.c
@@ -194,6 +194,7 @@ void host_enter_s3(const struct pm_s_state_data
*sstate_data, uint32_t pm1a_cnt_
CPU_IRQ_DISABLE();
vmx_off();

+ suspend_sched();
suspend_console();
suspend_ioapic();
suspend_iommu();
@@ -225,6 +226,7 @@ void host_enter_s3(const struct pm_s_state_data
*sstate_data, uint32_t pm1a_cnt_

/* console must be resumed after TSC restored since it will setup timer
base on TSC */
resume_console();
+ resume_sched();
}

void reset_host(void)
diff --git a/hypervisor/common/schedule.c b/hypervisor/common/schedule.c
index 681c17a..32a60bd 100644
--- a/hypervisor/common/schedule.c
+++ b/hypervisor/common/schedule.c
@@ -240,6 +240,24 @@ void kick_thread(const struct thread_object *obj)
release_schedule_lock(pcpu_id, rflag); }

+void suspend_sched(void)
+{
+ struct sched_control *ctl = &per_cpu(sched_ctl, get_pcpu_id());
+
+ if (ctl->scheduler->deinit != NULL) {
+ ctl->scheduler->deinit(ctl);
+ }
+}
+
+void resume_sched(void)
+{
+ struct sched_control *ctl = &per_cpu(sched_ctl, get_pcpu_id());
+
+ if (ctl->scheduler->init != NULL) {
+ ctl->scheduler->init(ctl);
+ }
+}
+
void yield(void)
{
make_reschedule_request(get_pcpu_id(), DEL_MODE_IPI); diff --git
a/hypervisor/include/common/schedule.h
b/hypervisor/include/common/schedule.h
index a237cdb..2753ef7 100644
--- a/hypervisor/include/common/schedule.h
+++ b/hypervisor/include/common/schedule.h
@@ -111,6 +111,8 @@ void wake_thread(struct thread_object *obj); void
kick_thread(const struct thread_object *obj); void yield(void); void
schedule(void);
+void suspend_sched(void);
+void resume_sched(void);

void arch_switch_to(void *prev_sp, void *next_sp); void
run_idle_thread(void);
--
2.8.3



Re: [PATCH v2 5/9] hv: sched: add yield support

Eddie Dong
 

Acked-by: Eddie Dong <eddie.dong@...>

-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Shuo A Liu
Sent: Thursday, December 5, 2019 5:15 PM
To: acrn-dev@...
Cc: Wang, Yu1 <yu1.wang@...>; Chen, Jason CJ
<jason.cj.chen@...>; Chen, Conghui <conghui.chen@...>; Liu,
Shuo A <shuo.a.liu@...>
Subject: [acrn-dev] [PATCH v2 5/9] hv: sched: add yield support

Add yield support for schedule, which can give up pcpu proactively.

Signed-off-by: Jason Chen CJ <jason.cj.chen@...>
Signed-off-by: Yu Wang <yu1.wang@...>
Signed-off-by: Shuo A Liu <shuo.a.liu@...>
---
hypervisor/common/schedule.c | 5 +++++
hypervisor/include/common/schedule.h | 1 +
2 files changed, 6 insertions(+)

diff --git a/hypervisor/common/schedule.c b/hypervisor/common/schedule.c
index 847bafd..681c17a 100644
--- a/hypervisor/common/schedule.c
+++ b/hypervisor/common/schedule.c
@@ -240,6 +240,11 @@ void kick_thread(const struct thread_object *obj)
release_schedule_lock(pcpu_id, rflag); }

+void yield(void)
+{
+ make_reschedule_request(get_pcpu_id(), DEL_MODE_IPI); }
+
void run_thread(struct thread_object *obj) {
uint64_t rflag;
diff --git a/hypervisor/include/common/schedule.h
b/hypervisor/include/common/schedule.h
index 5179b52..a237cdb 100644
--- a/hypervisor/include/common/schedule.h
+++ b/hypervisor/include/common/schedule.h
@@ -109,6 +109,7 @@ void run_thread(struct thread_object *obj); void
sleep_thread(struct thread_object *obj); void wake_thread(struct
thread_object *obj); void kick_thread(const struct thread_object *obj);
+void yield(void);
void schedule(void);

void arch_switch_to(void *prev_sp, void *next_sp);
--
2.8.3



Re: [PATCH v2 9/9] hv: sched: use hypervisor configuration to choose scheduler

Eddie Dong
 

Acked-by: Eddie Dong <eddie.dong@...>

-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Shuo A Liu
Sent: Thursday, December 5, 2019 5:15 PM
To: acrn-dev@...
Cc: Wang, Yu1 <yu1.wang@...>; Chen, Jason CJ
<jason.cj.chen@...>; Chen, Conghui <conghui.chen@...>; Liu,
Shuo A <shuo.a.liu@...>
Subject: [acrn-dev] [PATCH v2 9/9] hv: sched: use hypervisor configuration to
choose scheduler

For now, we set NOOP scheduler as default. User can choose IORR scheduler
as needed.

Signed-off-by: Shuo A Liu <shuo.a.liu@...>
---
hypervisor/Makefile | 4 ++++
hypervisor/arch/x86/Kconfig | 21 +++++++++++++++++++++
hypervisor/common/schedule.c | 5 +++++
3 files changed, 30 insertions(+)

diff --git a/hypervisor/Makefile b/hypervisor/Makefile index
fc19bb8..4a6569d 100644
--- a/hypervisor/Makefile
+++ b/hypervisor/Makefile
@@ -211,8 +211,12 @@ HW_C_SRCS += arch/x86/cat.c HW_C_SRCS +=
arch/x86/sgx.c HW_C_SRCS += common/softirq.c HW_C_SRCS +=
common/schedule.c
+ifeq ($(CONFIG_SCHED_NOOP),y)
HW_C_SRCS += common/sched_noop.c
+endif
+ifeq ($(CONFIG_SCHED_IORR),y)
HW_C_SRCS += common/sched_iorr.c
+endif
HW_C_SRCS += hw/pci.c
HW_C_SRCS += arch/x86/configs/vm_config.c HW_C_SRCS +=
arch/x86/configs/$(CONFIG_BOARD)/board.c
diff --git a/hypervisor/arch/x86/Kconfig b/hypervisor/arch/x86/Kconfig index
84bc7a0..12adced 100644
--- a/hypervisor/arch/x86/Kconfig
+++ b/hypervisor/arch/x86/Kconfig
@@ -37,6 +37,27 @@ config HYBRID

endchoice

+choice
+ prompt "ACRN Scheduler"
+ default SCHED_NOOP
+ help
+ Select the NOOP scheduler for hypervisor.
+ one vCPU running on one pCPU.
+
+config SCHED_NOOP
+ bool "NOOP scheduler"
+ help
+ The NOOP(No-Operation) scheduler only supports one vCPU running
on one pCPU.
+
+config SCHED_IORR
+ bool "IORR scheduler"
+ help
+ IORR (IO sensitive Round Robin) scheduler supports multiple
vCPUs running on
+ on one pCPU, and they will be scheduled by an IO sensitive round robin
policy.
+
+endchoice
+
+
config BOARD
string "Target board"
help
diff --git a/hypervisor/common/schedule.c b/hypervisor/common/schedule.c
index 32a60bd..71414e5 100644
--- a/hypervisor/common/schedule.c
+++ b/hypervisor/common/schedule.c
@@ -73,7 +73,12 @@ void init_sched(uint16_t pcpu_id)
ctl->flags = 0UL;
ctl->curr_obj = NULL;
ctl->pcpu_id = pcpu_id;
+#ifdef CONFIG_SCHED_NOOP
ctl->scheduler = &sched_noop;
+#endif
+#ifdef CONFIG_SCHED_IORR
+ ctl->scheduler = &sched_iorr;
+#endif
if (ctl->scheduler->init != NULL) {
ctl->scheduler->init(ctl);
}
--
2.8.3
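
As a usage note for the Kconfig choice above: selecting the IORR scheduler in a build boils
down to the corresponding symbol, e.g. a hypothetical .config fragment like the one below.
With that set, the Makefile rule above compiles common/sched_iorr.c and init_sched() binds
ctl->scheduler to &sched_iorr:

# CONFIG_SCHED_NOOP is not set
CONFIG_SCHED_IORR=y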



Re: [PATCH v2 4/9] hv: sched_iorr: add some interfaces implementation of sched_iorr

Eddie Dong
 

-----Original Message-----
From: acrn-dev@... <acrn-dev@...> On
Behalf Of Shuo A Liu
Sent: Thursday, December 5, 2019 5:15 PM
To: acrn-dev@...
Cc: Wang, Yu1 <yu1.wang@...>; Chen, Jason CJ
<jason.cj.chen@...>; Chen, Conghui <conghui.chen@...>; Liu,
Shuo A <shuo.a.liu@...>
Subject: [acrn-dev] [PATCH v2 4/9] hv: sched_iorr: add some interfaces
implementation of sched_iorr

Implement .sleep/.wake/.pick_next of sched_iorr.
In .pick_next, we count the current object's timeslice and pick the next available
one. The policy is
1) get the first item in the runqueue first
2) if the object picked has no time_cycles left, replenish it and pick this one
3) take the idle sched object if we still have no runnable object after
steps 1) and 2)
In .wake, we start the tick if we have more than one active thread_object in
runqueue. In .sleep, stop the tick timer if necessary.

Signed-off-by: Jason Chen CJ <jason.cj.chen@...>
Signed-off-by: Yu Wang <yu1.wang@...>
Signed-off-by: Shuo A Liu <shuo.a.liu@...>
---
hypervisor/common/sched_iorr.c | 47
++++++++++++++++++++++++++++++++++++++----
1 file changed, 43 insertions(+), 4 deletions(-)

diff --git a/hypervisor/common/sched_iorr.c
b/hypervisor/common/sched_iorr.c index d6d0ad9..6b12d01 100644
--- a/hypervisor/common/sched_iorr.c
+++ b/hypervisor/common/sched_iorr.c
@@ -121,17 +121,56 @@ void sched_iorr_init_data(struct thread_object
*obj)
data->left_cycles = data->slice_cycles = CONFIG_SLICE_MS *
CYCLES_PER_MS; }

-static struct thread_object *sched_iorr_pick_next(__unused struct
sched_control *ctl)
+static struct thread_object *sched_iorr_pick_next(struct sched_control
+*ctl)
{
- return NULL;
+ struct sched_iorr_control *iorr_ctl = (struct sched_iorr_control
*)ctl->priv;
+ struct thread_object *next = NULL;
+ struct thread_object *current = NULL;
+ struct sched_iorr_data *data;
+ uint64_t now = rdtsc();
+
+ current = ctl->curr_obj;
+ data = (struct sched_iorr_data *)current->data;
+ /* Ignore the idle object, inactive objects */
+ if (!is_idle_thread(current) && is_inqueue(current)) {
OK, we may have a different understanding of the idle thread... IMO, the idle thread is a normal thread, inside the runqueue, but with the least priority. It seems it is not in the runqueue in your proposal.
What is the tradeoff here?

+ data->left_cycles -= now - data->last_cycles;
+ if (data->left_cycles <= 0) {
+ /* replenish thread_object with slice_cycles */
+ data->left_cycles += data->slice_cycles;
+ }
+ /* move the thread_object to tail */
+ runqueue_remove(current);
+ runqueue_add_tail(current);
+ }
+
+ /*
+ * Pick the next runnable sched object
+ * 1) get the first item in runqueue firstly
+ * 2) if object picked has no time_cycles, replenish it pick this one
+ * 3) At least take one idle sched object if we have no runnable one after
step 1) and 2)
+ */
+ if (!list_empty(&iorr_ctl->runqueue)) {
+ next = get_first_item(&iorr_ctl->runqueue, struct thread_object,
data);
+ data = (struct sched_iorr_data *)next->data;
+ data->last_cycles = now;
+ while (data->left_cycles <= 0) {
+ data->left_cycles += data->slice_cycles;
+ }
I can understand we may have to add one slice. But why do we add slices without limit until it becomes positive?
In my understanding, a thread runs its time slice all the way down to below 0. Once it becomes < 0, we should trigger a scheduling (and hence it is only a little below 0).

+ } else {
+ next = &get_cpu_var(idle);
+ }
+
+ return next;
}

-static void sched_iorr_sleep(__unused struct thread_object *obj)
+static void sched_iorr_sleep(struct thread_object *obj)
{
+ runqueue_remove(obj);
If thread A wants to sleep thread B, this API is fine.

If thread A wants to sleep itself, this API may be bogus. Right? We need to pick next and trigger a switch to next...
}

-static void sched_iorr_wake(__unused struct thread_object *obj)
+static void sched_iorr_wake(struct thread_object *obj)
{
+ runqueue_add_head(obj);
}

struct acrn_scheduler sched_iorr = {
--
2.8.3

