CPU Virtualization
Creating the VM on the QEMU Side
Creating the VM on the QEMU side is fairly simple: kvm_init calls ioctl(KVM_CREATE_VM) and stores the returned handle in KVMState->vmfd so that it can be used elsewhere. QEMU uses the KVMState structure to hold its KVM-related state.
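As a quick orientation, here is a minimal, self-contained sketch (not QEMU's actual code) of what this boils down to at the system-call level; error handling is reduced to the bare minimum and only standard KVM ioctls are used:
#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>
int main(void)
{
    int kvm_fd = open("/dev/kvm", O_RDWR);       /* what KVMState->fd holds   */
    if (kvm_fd < 0) {
        perror("open /dev/kvm");
        return 1;
    }
    if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION) {
        fprintf(stderr, "unexpected KVM API version\n");
        return 1;
    }
    int vmfd = ioctl(kvm_fd, KVM_CREATE_VM, 0);  /* what KVMState->vmfd holds */
    if (vmfd < 0) {
        perror("KVM_CREATE_VM");
        return 1;
    }
    printf("vm fd = %d\n", vmfd);
    return 0;
}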
KVMState
As usual, let's take a look at this important data structure:
struct KVMState
{
AccelState parent_obj; // parent class: QEMU supports several accelerators (tcg, kvm, ...), hence the base type
int nr_slots; // number of memory slots, obtained from KVM
int fd; // handle for the kernel's /dev/kvm module; the kernel must support KVM for anything else to work
int vmfd; // handle for the VM created inside KVM
int coalesced_mmio; // KVM sets aside a page for coalesced MMIO, so MMIO writes to registered regions are batched instead of exiting every time
int coalesced_pio;
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
bool coalesced_flush_in_progress;
int vcpu_events;
int robust_singlestep;
int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
int max_nested_state_len;
int many_ioeventfds;
int intx_set_mask;
int kvm_shadow_mem;
bool kernel_irqchip_allowed;
bool kernel_irqchip_required;
OnOffAuto kernel_irqchip_split;
bool sync_mmu;
uint64_t manual_dirty_log_protect;
/* The man page (and posix) say ioctl numbers are signed int, but
* they're not. Linux, glibc and *BSD all treat ioctl numbers as
* unsigned, and treating them as signed here can break things */
unsigned irq_set_ioctl;
unsigned int sigmask_len;
GHashTable *gsimap;
#ifdef KVM_CAP_IRQ_ROUTING
struct kvm_irq_routing *irq_routes;
int nr_allocated_irq_routes;
unsigned long *used_gsi_bitmap;
unsigned int gsi_count;
QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
#endif
KVMMemoryListener memory_listener;
QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
/* For "info mtree -f" to tell if an MR is registered in KVM */
int nr_as;
struct KVMAs {
KVMMemoryListener *ml;
AddressSpace *as;
} *as; // the memory regions registered for this VM can be found starting from here
// TODO: not yet clear what the dirty ring is
uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */
uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */
struct KVMDirtyRingReaper reaper;
};
This is the call chain:
#0 kvm_init (ms=0x55555699a400) at ../accel/kvm/kvm-all.c:2307
#1 0x0000555555af387b in accel_init_machine (accel=0x55555681c140, ms=0x55555699a400) at ../accel/accel-softmmu.c:39
#2 0x0000555555c0752b in do_configure_accelerator (opaque=0x7fffffffda0d, opts=0x555556a448e0, errp=0x55555677c100 <error_fatal>) at ../softmmu/vl.c:2348
#3 0x0000555555f01307 in qemu_opts_foreach (list=0x5555566a18c0 <qemu_accel_opts>, func=0x555555c07401 <do_configure_accelerator>, opaque=0x7fffffffda0d, errp=0x55555677c100 <error_fatal>)
at ../util/qemu-option.c:1135
#4 0x0000555555c07790 in configure_accelerators (progname=0x7fffffffe0c5 "/home/guanshun/gitlab/qemu-newest/build/qemu-system-x86_64") at ../softmmu/vl.c:2414
#5 0x0000555555c0a834 in qemu_init (argc=13, argv=0x7fffffffdcc8, envp=0x7fffffffdd38) at ../softmmu/vl.c:3724 // QEMU's command-line parsing is done here
#6 0x000055555583b6f5 in main (argc=13, argv=0x7fffffffdcc8, envp=0x7fffffffdd38) at ../softmmu/main.c:49
In short, when KVM is in use, kvm_init also issues a series of ioctl calls to find out which features the kernel's KVM can provide; most importantly, it obtains vmfd.
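That feature probing is done with KVM_CHECK_EXTENSION ioctls on the /dev/kvm fd (QEMU wraps this in kvm_check_extension). A hedged sketch of the raw call, reusing kvm_fd and the headers from the sketch above; the two capability names in the trailing comment are examples of how kvm_init uses the results:
/* returns non-zero if the extension is available; for some capabilities the
 * (positive) return value itself carries information, e.g. a count */
static int probe_extension(int kvm_fd, int cap)
{
    int ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);
    return ret > 0 ? ret : 0;
}
/* e.g. probe_extension(kvm_fd, KVM_CAP_NR_MEMSLOTS) feeds KVMState->nr_slots,
 *      probe_extension(kvm_fd, KVM_CAP_COALESCED_MMIO) feeds coalesced_mmio */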
Creating the VM on the KVM Side
kvm_dev_ioctl is the ioctl interface exposed to user-space QEMU; it dispatches on the request type. For KVM_CREATE_VM it calls kvm_dev_ioctl_create_vm to create a virtual machine and returns a file descriptor to user space, which QEMU then uses to operate on the VM. The KVM side was not analyzed in detail before, so let's look at it more closely now.
static long kvm_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
long r = -EINVAL;
switch (ioctl) {
case KVM_GET_API_VERSION:
if (arg)
goto out;
r = KVM_API_VERSION;
break;
case KVM_CREATE_VM:
r = kvm_dev_ioctl_create_vm(arg);
break;
case KVM_CHECK_EXTENSION:
r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
break;
case KVM_GET_VCPU_MMAP_SIZE:
if (arg)
goto out;
r = PAGE_SIZE; /* struct kvm_run */
#ifdef CONFIG_X86
r += PAGE_SIZE; /* pio data page */
#endif
#ifdef CONFIG_KVM_MMIO
r += PAGE_SIZE; /* coalesced mmio ring page */
#endif
break;
case KVM_TRACE_ENABLE:
case KVM_TRACE_PAUSE:
case KVM_TRACE_DISABLE:
r = -EOPNOTSUPP;
break;
default:
return kvm_arch_dev_ioctl(filp, ioctl, arg);
}
out:
return r;
}
The main task of kvm_dev_ioctl_create_vm is to call kvm_create_vm to create a VM instance; each VM instance is represented by a struct kvm.
kvm
This structure touches many different subsystems; record it here for now and fill in the details bit by bit later.
struct kvm {
... // various locks
struct mm_struct *mm; /* userspace tied to this vm */
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; // every VM has one or more VCPUs
/* Used to wait for completion of MMU notifiers. */
spinlock_t mn_invalidate_lock;
unsigned long mn_active_invalidate_count;
struct rcuwait mn_memslots_update_rcuwait;
/*
* created_vcpus is protected by kvm->lock, and is incremented
* at the beginning of KVM_CREATE_VCPU. online_vcpus is only
* incremented after storing the kvm_vcpu pointer in vcpus,
* and is accessed atomically.
*/
atomic_t online_vcpus;
int created_vcpus;
int last_boosted_vcpu;
struct list_head vm_list;
struct mutex lock;
struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
#ifdef CONFIG_HAVE_KVM_EVENTFD
struct {
spinlock_t lock;
struct list_head items;
struct list_head resampler_list;
struct mutex resampler_lock;
} irqfds;
struct list_head ioeventfds;
#endif
struct kvm_vm_stat stat;
struct kvm_arch arch;
refcount_t users_count;
#ifdef CONFIG_KVM_MMIO
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
spinlock_t ring_lock;
struct list_head coalesced_zones;
#endif
struct mutex irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
/*
* Update side is protected by irq_lock.
*/
struct kvm_irq_routing_table __rcu *irq_routing;
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
struct hlist_head irq_ack_notifier_list;
#endif
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
unsigned long mmu_notifier_range_start;
unsigned long mmu_notifier_range_end;
#endif
struct list_head devices;
u64 manual_dirty_log_protect;
struct dentry *debugfs_dentry;
struct kvm_stat_data **debugfs_stat_data;
struct srcu_struct srcu;
struct srcu_struct irq_srcu;
pid_t userspace_pid;
unsigned int max_halt_poll_ns;
u32 dirty_ring_size;
bool vm_bugged;
#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
struct notifier_block pm_notifier;
#endif
char stats_id[KVM_STATS_NAME_SIZE];
};
kvm_dev_ioctl_create_vm mainly calls kvm_create_vm to initialize these structures.
static int kvm_dev_ioctl_create_vm(unsigned long type)
{
int r;
struct kvm *kvm;
struct file *file;
kvm = kvm_create_vm(type);
if (IS_ERR(kvm))
return PTR_ERR(kvm);
#ifdef CONFIG_KVM_MMIO
r = kvm_coalesced_mmio_init(kvm); // allocate the page used for coalesced MMIO
if (r < 0)
goto put_kvm;
#endif
r = get_unused_fd_flags(O_CLOEXEC);
if (r < 0)
goto put_kvm;
snprintf(kvm->stats_id, sizeof(kvm->stats_id),
"kvm-%d", task_pid_nr(current));
file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
if (IS_ERR(file)) {
put_unused_fd(r);
r = PTR_ERR(file);
goto put_kvm;
}
...
return r;
put_kvm:
kvm_put_kvm(kvm); // creation failed, destroy the VM
return r;
}
kvm_create_vm
This is the key function for creating a VM: it initializes the relevant members and enables VMX mode. Some members differ between architectures, so kvm_arch_init_vm initializes the architecture-specific parts.
static struct kvm *kvm_create_vm(unsigned long type)
{
struct kvm *kvm = kvm_arch_alloc_vm(); // allocate the struct kvm (via vmalloc)
... // initialize the various locks
refcount_set(&kvm->users_count, 1);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
struct kvm_memslots *slots = kvm_alloc_memslots(); // allocate memory slots for the VM
if (!slots)
goto out_err_no_arch_destroy_vm;
/* Generations must be different for each address space. */
slots->generation = i;
rcu_assign_pointer(kvm->memslots[i], slots);
}
for (i = 0; i < KVM_NR_BUSES; i++) {
rcu_assign_pointer(kvm->buses[i],
kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
if (!kvm->buses[i])
goto out_err_no_arch_destroy_vm;
}
kvm->max_halt_poll_ns = halt_poll_ns;
r = kvm_arch_init_vm(kvm, type); // architecture-specific initialization
if (r)
goto out_err_no_arch_destroy_vm;
r = hardware_enable_all(); // eventually enables VMX mode on each physical CPU
if (r)
goto out_err_no_disable;
#ifdef CONFIG_HAVE_KVM_IRQFD
INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif
r = kvm_init_mmu_notifier(kvm);
if (r)
goto out_err_no_mmu_notifier;
r = kvm_arch_post_init_vm(kvm);
if (r)
goto out_err;
...
return kvm;
...
}
static int hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
int r;
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
/*
* This can happen if we hot-added a CPU but failed to allocate
* VP assist page for it.
*/
if (static_branch_unlikely(&enable_evmcs) &&
!hv_get_vp_assist_page(cpu))
return -EFAULT;
intel_pt_handle_vmx(1);
r = kvm_cpu_vmxon(phys_addr); // sets CR4.VMXE and executes VMXON to enable VMX
if (r) {
intel_pt_handle_vmx(0);
return r;
}
if (enable_ept) // with EPT enabled, flush all EPT TLB entries (a global INVEPT)
ept_sync_global();
return 0;
}
Creating the CPU in QEMU
The previous two sections covered creating the VM itself. A VM can have multiple VCPUs, so next we analyze how a VCPU is created. The QOM chapter already covered CPU type initialization and object instance initialization, but the CPU object still has to be realized before it can be used; my understanding of realization is that it is what ultimately lets the guest OS run on the VCPU. This work is done by x86_cpu_realizefn.
static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
{
CPUState *cs = CPU(dev);
X86CPU *cpu = X86_CPU(dev);
X86CPUClass *xcc = X86_CPU_GET_CLASS(dev);
CPUX86State *env = &cpu->env;
Error *local_err = NULL;
static bool ht_warned;
...
// set the CPU instance's properties to true/false according to the CPU features parsed from the QEMU command line
x86_cpu_expand_features(cpu, &local_err);
if (local_err) {
goto out;
}
// check whether the host CPU's features can support the CPU object being created
x86_cpu_filter_features(cpu, cpu->check_cpuid || cpu->enforce_cpuid);
...
/* Process Hyper-V enlightenments */
x86_cpu_hyperv_realize(cpu); // initialize the Hyper-V related fields of the CPU instance
cpu_exec_realizefn(cs, &local_err); // calls cpu_list_add to put the CPU being initialized on the global cpus list
if (local_err != NULL) {
error_propagate(errp, local_err);
return;
}
#ifndef CONFIG_USER_ONLY
MachineState *ms = MACHINE(qdev_get_machine());
qemu_register_reset(x86_cpu_machine_reset_cb, cpu);
if (cpu->env.features[FEAT_1_EDX] & CPUID_APIC || ms->smp.cpus > 1) {
x86_cpu_apic_create(cpu, &local_err);
if (local_err != NULL) {
goto out;
}
}
#endif
mce_init(cpu); // machine-check (MCE) setup; not explored further here
...
// which initialization path runs is decided by the accelerator in use,
// via cpus_accel->create_vcpu_thread(cpu); the accelerators live under accel/
qemu_init_vcpu(cs);
...
x86_cpu_apic_realize(cpu, &local_err);
if (local_err != NULL) {
goto out;
}
cpu_reset(cs);
xcc->parent_realize(dev, &local_err);
out:
if (local_err != NULL) {
error_propagate(errp, local_err);
return;
}
}
The call chain is as follows:
#0 qemu_thread_create (thread=0x5555569aa130, name=0x7fffffffd5e0 "CPU 0/KVM", start_routine=0x555555c84991 <kvm_vcpu_thread_fn>, arg=0x55555699d430, mode=0)
at ../util/qemu-thread-posix.c:529
#1 0x0000555555c84b8b in kvm_start_vcpu_thread (cpu=0x55555699d430) at ../accel/kvm/kvm-accel-ops.c:73
#2 0x0000555555c513b4 in qemu_init_vcpu (cpu=0x55555699d430) at ../softmmu/cpus.c:628
#3 0x0000555555b74759 in x86_cpu_realizefn (dev=0x55555699d430, errp=0x7fffffffd6e0) at ../target/i386/cpu.c:6910
#4 0x0000555555dfec89 in device_set_realized (obj=0x55555699d430, value=true, errp=0x7fffffffd7e8) at ../hw/core/qdev.c:761
#5 0x0000555555de5a4e in property_set_bool (obj=0x55555699d430, v=0x5555569a9f20, name=0x55555608c221 "realized", opaque=0x55555675d100, errp=0x7fffffffd7e8) at ../qom/object.c:2257
#6 0x0000555555de3a6f in object_property_set (obj=0x55555699d430, name=0x55555608c221 "realized", v=0x5555569a9f20, errp=0x5555566e9a98 <error_fatal>) at ../qom/object.c:1402
#7 0x0000555555ddf2a0 in object_property_set_qobject (obj=0x55555699d430, name=0x55555608c221 "realized", value=0x55555699c6d0, errp=0x5555566e9a98 <error_fatal>)
at ../qom/qom-qobject.c:28
#8 0x0000555555de3de7 in object_property_set_bool (obj=0x55555699d430, name=0x55555608c221 "realized", value=true, errp=0x5555566e9a98 <error_fatal>) at ../qom/object.c:1472
#9 0x0000555555dfdca9 in qdev_realize (dev=0x55555699d430, bus=0x0, errp=0x5555566e9a98 <error_fatal>) at ../hw/core/qdev.c:389
#10 0x0000555555b27071 in x86_cpu_new (x86ms=0x5555568b9de0, apic_id=0, errp=0x5555566e9a98 <error_fatal>) at ../hw/i386/x86.c:111
#11 0x0000555555b27144 in x86_cpus_init (x86ms=0x5555568b9de0, default_cpu_version=1) at ../hw/i386/x86.c:138
#12 0x0000555555b2047a in pc_init1 (machine=0x5555568b9de0, host_type=0x555556008544 "i440FX-pcihost", pci_type=0x55555600853d "i440FX") at ../hw/i386/pc_piix.c:159
#13 0x0000555555b20efa in pc_init_v6_0 (machine=0x5555568b9de0) at ../hw/i386/pc_piix.c:427
#14 0x0000555555a8c6e9 in machine_run_board_init (machine=0x5555568b9de0) at ../hw/core/machine.c:1232
#15 0x0000555555c77b73 in qemu_init_board () at ../softmmu/vl.c:2514
#16 0x0000555555c77d52 in qmp_x_exit_preconfig (errp=0x5555566e9a98 <error_fatal>) at ../softmmu/vl.c:2588
#17 0x0000555555c7a45f in qemu_init (argc=5, argv=0x7fffffffdd98, envp=0x7fffffffddc8) at ../softmmu/vl.c:3611
#18 0x0000555555818f05 in main (argc=5, argv=0x7fffffffdd98, envp=0x7fffffffddc8) at ../softmmu/main.c:49
Why x86_cpu_new triggers this chain of calls will be analyzed later; for now let's focus on x86_cpu_realizefn.
First, the variable declarations at the top:
CPUState *cs = CPU(dev);
X86CPU *cpu = X86_CPU(dev);
X86CPUClass *xcc = X86_CPU_GET_CLASS(dev);
CPUX86State *env = &cpu->env;
Their relationship is as follows:
           /  +--------------+
           |  |  CPUState    |  env_ptr ---+     data every CPU model has
           |  +--------------+             |
           |  |              |             |
  X86CPU  <   +--------------+ <-----------+
           |  |              |
           |  | CPUX86State  |  (env)            x86-specific data
           |  |              |
           |  +--------------+
           |  |              |                   data CPU virtualization needs
           \  +--------------+
In addition, X86CPUClass is class-level (effectively static) data; its members are shared by all x86 CPUs.
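A simplified sketch of this nesting (field lists heavily abbreviated and types loosened; the real definitions are in include/hw/core/cpu.h and target/i386/cpu.h):
struct CPUState {            /* data every CPU model has                  */
    DeviceState parent_obj;
    int cpu_index;
    void *env_ptr;           /* CPUArchState *; points at the env below   */
    /* ... */
};
struct X86CPU {              /* the x86 CPU instance                      */
    CPUState parent_obj;     /* "cs" in x86_cpu_realizefn                 */
    CPUX86State env;         /* "env": x86-specific state, including what
                                CPU virtualization needs                  */
    /* ... */
};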
Next, let's dig into qemu_init_vcpu:
void qemu_init_vcpu(CPUState *cpu)
{
MachineState *ms = MACHINE(qdev_get_machine());
cpu->nr_cores = ms->smp.cores; // straightforward: record the topology from the machine's SMP config
cpu->nr_threads = ms->smp.threads;
cpu->stopped = true;
cpu->random_seed = qemu_guest_random_seed_thread_part1();
if (!cpu->as) {
/* If the target cpu hasn't set up any address spaces itself,
* give it the default one.
*/
cpu->num_ases = 1;
cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory); // this only creates the address space; it is not fully initialized yet
}
/* accelerators all implement the AccelOpsClass */
g_assert(cpus_accel != NULL && cpus_accel->create_vcpu_thread != NULL);
cpus_accel->create_vcpu_thread(cpu);
while (!cpu->created) {
qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
}
}
It first records the CPU's core and thread counts, then creates its address space. With the KVM accelerator, create_vcpu_thread resolves to kvm_start_vcpu_thread:
static void kvm_start_vcpu_thread(CPUState *cpu)
{
char thread_name[VCPU_THREAD_NAME_SIZE];
cpu->thread = g_malloc0(sizeof(QemuThread));
cpu->halt_cond = g_malloc0(sizeof(QemuCond));
qemu_cond_init(cpu->halt_cond);
snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
cpu->cpu_index);
qemu_thread_create(cpu->thread, thread_name, kvm_vcpu_thread_fn,
cpu, QEMU_THREAD_JOINABLE);
}
This function creates the VCPU thread, with kvm_vcpu_thread_fn as its body, and starts it running.
static void *kvm_vcpu_thread_fn(void *arg)
{
CPUState *cpu = arg;
int r;
rcu_register_thread();
qemu_mutex_lock_iothread();
qemu_thread_get_self(cpu->thread);
cpu->thread_id = qemu_get_thread_id();
cpu->can_do_io = 1;
current_cpu = cpu;
r = kvm_init_vcpu(cpu, &error_fatal);
kvm_init_cpu_signals(cpu);
/* signal CPU creation */
cpu_thread_signal_created(cpu);
qemu_guest_random_seed_thread_part2(cpu->random_seed);
do {
if (cpu_can_run(cpu)) {
r = kvm_cpu_exec(cpu); // run this vcpu; user space blocks here while the guest runs
if (r == EXCP_DEBUG) {
cpu_handle_guest_debug(cpu);
}
}
qemu_wait_io_event(cpu); // QEMU is split into a main thread and I/O threads; the I/O side handles device reads and writes
} while (!cpu->unplug || cpu_can_run(cpu));
kvm_destroy_vcpu(cpu); // once the loop exits, destroy the vcpu
cpu_thread_signal_destroyed(cpu);
qemu_mutex_unlock_iothread();
rcu_unregister_thread();
return NULL;
}
This function first calls kvm_init_vcpu, which has KVM create the VCPU. Recall that KVMState is the QEMU-side structure for KVM-related state and holds the vmfd used to operate on the VM; an easy point of confusion is that inside KVM the VM itself is represented by a struct kvm.
int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
KVMState *s = kvm_state;
long mmap_size;
int ret;
trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
...
cpu->kvm_fd = ret;
cpu->kvm_state = s;
cpu->vcpu_dirty = true;
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
ret = mmap_size;
error_setg_errno(errp, -mmap_size,
"kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
goto err;
}
cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
cpu->kvm_fd, 0); // map the vcpu's kvm_run area (shared with KVM) into QEMU's address space
...
ret = kvm_arch_init_vcpu(cpu); // builds the guest CPUID and passes it to KVM (see the CPUID section below)
...
err:
return ret;
}
kvm_get_vcpu first searches the existing (parked) VCPU list for one with the matching vcpu_id; if there is none, it issues a KVM_CREATE_VCPU ioctl to KVM, passing in the vcpu_id.
static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
struct KVMParkedVcpu *cpu;
QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
if (cpu->vcpu_id == vcpu_id) {
int kvm_fd;
QLIST_REMOVE(cpu, node);
kvm_fd = cpu->kvm_fd;
g_free(cpu);
return kvm_fd;
}
}
return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}
Underneath, kvm_vm_ioctl is just an ioctl system call on vmfd, which is picked up and handled by KVM in the kernel.
int kvm_vm_ioctl(KVMState *s, int type, ...)
{
int ret;
void *arg;
va_list ap;
va_start(ap, type);
arg = va_arg(ap, void *);
va_end(ap);
trace_kvm_vm_ioctl(type, arg);
ret = ioctl(s->vmfd, type, arg);
if (ret == -1) {
ret = -errno;
}
return ret;
}
kvm_vcpu_thread_fn then calls kvm_init_cpu_signals to set up the thread's signal handling, so that the CPU thread can receive IPI signals.
The do/while loop that follows is the most important part. cpu_can_run decides whether the CPU may run; if so, kvm_cpu_exec is called. That function issues kvm_vcpu_ioctl, i.e. KVM's ioctl(KVM_RUN), which puts the VCPU on a physical CPU and runs it; user space blocks in this ioctl. When the guest OS triggers a VM Exit, the kernel handles it according to the exit reason and re-enters, returning to QEMU only when it cannot handle the exit itself. This loop is the heart of CPU virtualization.
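Stripped of QEMU's bookkeeping, the loop that kvm_cpu_exec implements looks roughly like this at the raw ioctl level (a sketch only: vcpu_fd and the mmap'ed run structure are assumed to have been set up as described in the shared-data section below):
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>
static int run_vcpu(int vcpu_fd, struct kvm_run *run)
{
    for (;;) {
        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {  /* blocks while the guest runs     */
            perror("KVM_RUN");
            return -1;
        }
        switch (run->exit_reason) {            /* written by KVM before returning */
        case KVM_EXIT_IO:                      /* port I/O that KVM cannot emulate */
            /* the data sits inside the shared mapping at run->io.data_offset */
            break;
        case KVM_EXIT_MMIO:                    /* access to emulated MMIO */
            break;
        case KVM_EXIT_HLT:                     /* guest executed HLT */
            return 0;
        default:
            fprintf(stderr, "unhandled exit %d\n", run->exit_reason);
            return -1;
        }
    }
}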
Creating the VCPU in KVM
As mentioned earlier, kvm_dev_ioctl_create_vm creates an anonymous file representing the VM and returns its fd to user space; QEMU operates on the VM through that descriptor, i.e. this line:
file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
The operations of this anonymous file are defined as follows:
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl,
.llseek = noop_llseek,
KVM_COMPAT(kvm_vm_compat_ioctl),
};
Its main operation is again ioctl, handled by kvm_vm_ioctl. This function is the entry point for VM-level ioctls, while the earlier kvm_dev_ioctl is the device-level (/dev/kvm) entry point; don't mix them up. kvm_vm_ioctl handles all kinds of requests; the request issued by QEMU's own kvm_vm_ioctl wrapper that we just analyzed is processed here (note that one is a QEMU function and the other is a kernel KVM function with the same name). Let's start with KVM_CREATE_VCPU, i.e. creating a VCPU.
static long kvm_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm *kvm = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
if (kvm->mm != current->mm || kvm->vm_bugged)
return -EIO;
switch (ioctl) {
case KVM_CREATE_VCPU:
r = kvm_vm_ioctl_create_vcpu(kvm, arg);
break;
...
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}
out:
return r;
}
kvm_vm_ioctl_create_vcpu
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
int r;
struct kvm_vcpu *vcpu;
struct page *page;
...
kvm->created_vcpus++;
mutex_unlock(&kvm->lock);
r = kvm_arch_vcpu_precreate(kvm, id);
if (r)
goto vcpu_decrement;
vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); // allocate the kvm_vcpu structure
if (!vcpu) {
r = -ENOMEM;
goto vcpu_decrement;
}
BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page) {
r = -ENOMEM;
goto vcpu_free;
}
vcpu->run = page_address(page); // runtime state for the vcpu; this is the kvm_run page later mmap'ed and shared with QEMU
kvm_vcpu_init(vcpu, kvm, id); // initialize the generic fields of the vcpu
r = kvm_arch_vcpu_create(vcpu); // architecture-specific part
if (r)
goto vcpu_free_run_page;
...
vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
/* Fill the stats id string for the vcpu */
snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
task_pid_nr(current), id);
/* Now it's all set up, let userspace reach it */
kvm_get_kvm(kvm);
r = create_vcpu_fd(vcpu);
if (r < 0) {
kvm_put_kvm_no_destroy(kvm);
goto unlock_vcpu_destroy;
}
kvm->vcpus[vcpu->vcpu_idx] = vcpu; // store the newly created vcpu in its VM
/*
* Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus
* before kvm->online_vcpu's incremented value.
*/
smp_wmb();
atomic_inc(&kvm->online_vcpus);
mutex_unlock(&kvm->lock);
kvm_arch_vcpu_postcreate(vcpu);
kvm_create_vcpu_debugfs(vcpu);
return r;
...
return r;
}
The code here is clear: kvm_vcpu_init and kvm_arch_vcpu_create initialize the VCPU. kvm_vcpu_init sets up the architecture-independent fields, the most important being vcpu->kvm = kvm; and vcpu->vcpu_id = id;. kvm_arch_vcpu_create then does further initialization, which involves many details of the CPU's behavior that I don't fully understand yet; we can dig into them when needed.
kvm_arch_vcpu_create
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
struct page *page;
int r;
...
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
// initialize the MMU, e.g. the pte_list structures
r = kvm_mmu_create(vcpu); // deserves a deeper analysis
if (r < 0)
return r;
// lapic
if (irqchip_in_kernel(vcpu->kvm)) {
r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
if (r < 0)
goto fail_mmu_destroy;
if (kvm_apicv_activated(vcpu->kvm))
vcpu->arch.apicv_active = true;
} else
static_branch_inc(&kvm_has_noapic_vcpu);
r = -ENOMEM;
page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!page)
goto fail_free_lapic;
vcpu->arch.pio_data = page_address(page);
vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.mce_banks)
goto fail_free_pio_data;
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
...
// this ends up in the vmx_create_vcpu callback, which is the key function here
r = static_call(kvm_x86_vcpu_create)(vcpu);
if (r)
goto free_guest_fpu;
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
kvm_set_tsc_khz(vcpu, max_tsc_khz);
kvm_vcpu_reset(vcpu, false);
kvm_init_mmu(vcpu); // memory-virtualization initialization; needs further analysis
vcpu_put(vcpu);
return 0;
...
return r;
}
Besides this further initialization, on x86 (Intel VMX) kvm_arch_vcpu_create calls vmx_create_vcpu; the architecture-specific data is initialized there, including the VMCS setup.
vmx_create_vcpu
static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vmx_uret_msr *tsx_ctrl;
// a VMX VCPU is represented by this structure; since the kernel supports
// several architectures, kvm_vcpu acts as the parent that vcpu_vmx embeds
struct vcpu_vmx *vmx;
int i, cpu, err;
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu); // kvm_vcpu is the generic-level vcpu; to_vmx() gets the containing vcpu_vmx
err = -ENOMEM;
// each vcpu is associated with a VPID; used together with EPT, VPIDs let a
// VCPU switch avoid flushing the entire TLB
vmx->vpid = allocate_vpid();
...
for (i = 0; i < kvm_nr_uret_msrs; ++i) {
vmx->guest_uret_msrs[i].data = 0;
vmx->guest_uret_msrs[i].mask = -1ull;
}
if (boot_cpu_has(X86_FEATURE_RTM)) {
/*
* TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
* Keep the host value unchanged to avoid changing CPUID bits
* under the host kernel's feet.
*/
tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (tsx_ctrl)
tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
}
err = alloc_loaded_vmcs(&vmx->vmcs01);
if (err < 0)
goto free_pml;
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
...
// loaded_vmcs points to the VMCS region used by this VCPU; vmcs01 is the one
// for ordinary virtualization, nested virtualization uses others
vmx->loaded_vmcs = &vmx->vmcs01;
// get the current physical cpu (disables preemption)
cpu = get_cpu();
// bind the vcpu to the current physical cpu
vmx_vcpu_load(vcpu, cpu);
vcpu->cpu = cpu;
init_vmcs(vmx); // vmx_vcpu_load and init_vmcs set up the guest/host switching state; a good entry point for studying the VMCS
vmx_vcpu_put(vcpu);
put_cpu();
if (cpu_need_virtualize_apic_accesses(vcpu)) {
err = alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
if (enable_ept && !enable_unrestricted_guest) {
err = init_rmode_identity_map(vcpu->kvm);
if (err)
goto free_vmcs;
}
...
return 0;
...
return err;
}
After initialization completes, the VCPU is stored in the VM's vcpus array:
kvm->vcpus[vcpu->vcpu_idx] = vcpu;
Data Shared Between QEMU and KVM
QEMU and KVM frequently need to share data; for example, KVM places VM Exit information in shared memory and QEMU reads it from there. In kvm_vcpu_thread_fn, the starting point where QEMU asks KVM to create a VCPU, kvm_init_vcpu is called; it checks QEMU's parked-VCPU list for a suitable VCPU and, if there is none, asks KVM to create one via ioctl. It then calls ioctl(KVM_GET_VCPU_MMAP_SIZE), which returns the size of the memory shared between KVM and QEMU:
int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
KVMState *s = kvm_state;
long mmap_size;
int ret;
trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
...
cpu->kvm_fd = ret;
cpu->kvm_state = s;
cpu->vcpu_dirty = true;
cpu->dirty_pages = 0;
// size of the shared memory
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
...
// kvm_fd: vCPU file descriptor for KVM.
cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
cpu->kvm_fd, 0);
...
return ret;
}
The pages covered by the size returned from ioctl(KVM_GET_VCPU_MMAP_SIZE) hold the following:
case KVM_GET_VCPU_MMAP_SIZE:
if (arg)
goto out;
r = PAGE_SIZE; /* struct kvm_run */
#ifdef CONFIG_X86
r += PAGE_SIZE; /* pio data page */
#endif
#ifdef CONFIG_KVM_MMIO
r += PAGE_SIZE; /* coalesced mmio ring page */
#endif
break;
Once it has the size, QEMU mmaps the VCPU's fd to obtain cpu->kvm_run (and, when the dirty ring is enabled, a second mapping backs cpu->kvm_dirty_gfns). In other words, when QEMU accesses cpu->kvm_run it is actually reading the kvm_run data maintained by KVM.
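As an illustration of how this shared mapping is consumed, here is a hedged sketch of what a port-I/O exit handler does with run->io.data_offset (it mirrors what kvm_handle_io receives; the port number 0x3f8 and the putchar handling are made up for the example):
#include <linux/kvm.h>
#include <stdint.h>
#include <stdio.h>
static void handle_io_exit(struct kvm_run *run)
{
    /* the bytes being transferred live inside the same shared mapping,
     * at an offset that KVM reports in run->io.data_offset */
    uint8_t *data = (uint8_t *)run + run->io.data_offset;
    if (run->io.direction == KVM_EXIT_IO_OUT && run->io.port == 0x3f8) {
        for (uint32_t i = 0; i < run->io.count * run->io.size; i++)
            putchar(data[i]);   /* pretend port 0x3f8 is a serial console */
    }
}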
Constructing the VCPU CPUID
kvm_init_vcpu finally calls kvm_arch_init_vcpu to finish the architecture-specific VCPU initialization; most of that work is constructing the guest VCPU's CPUID. Through CPUID the guest learns the CPU model, its performance-related parameters, and so on.
The QEMU command line specifies the CPU model plus the CPU features to add or remove. From these, QEMU builds a cpuid_data structure and passes it into KVM's per-VCPU data structures with the VCPU ioctl(KVM_SET_CPUID2). Later, when the guest executes the CPUID instruction, it causes a VM Exit and traps into KVM, and KVM returns the configured data to the guest: CPUID is one of the instructions that unconditionally exits to VMX root mode, so it is always handled by the kernel.
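A sketch of the raw ioctl sequence behind this (QEMU's real implementation is kvm_arch_init_vcpu; MAX_CPUID_ENTRIES and the skipped filtering step are placeholders here):
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdlib.h>
#define MAX_CPUID_ENTRIES 100
static int set_guest_cpuid(int kvm_fd, int vcpu_fd)
{
    struct kvm_cpuid2 *cpuid;
    cpuid = calloc(1, sizeof(*cpuid) +
                      MAX_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2));
    if (!cpuid)
        return -1;
    cpuid->nent = MAX_CPUID_ENTRIES;
    /* ask KVM which CPUID leaves/features the host side can expose ... */
    if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) < 0)
        goto err;
    /* ... QEMU would filter/patch the entries according to -cpu here ... */
    /* ... then install the result into the VCPU */
    if (ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid) < 0)
        goto err;
    free(cpuid);
    return 0;
err:
    free(cpuid);
    return -1;
}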
Running the VCPU
First, every VCPU has an associated VMCS; the VMCS is to a VCPU roughly what a process descriptor is to a process. It manages transitions into and out of VMX non-root operation and controls the VCPU's behavior. The instructions that operate on a VMCS are VMCLEAR, VMPTRLD, VMREAD and VMWRITE. A VMCS region is 4 KB in size, and the VMM refers to it by its 64-bit physical address.
Earlier we saw kvm_vcpu_thread_fn create and run the VCPU thread, and traced how QEMU gets KVM to create the VCPU via kvm_init_vcpu; now let's continue with the important do/while loop.
do {
if (cpu_can_run(cpu)) {
r = kvm_cpu_exec(cpu);
if (r == EXCP_DEBUG) {
cpu_handle_guest_debug(cpu);
}
}
qemu_wait_io_event(cpu);
} while (!cpu->unplug || cpu_can_run(cpu));
cpu_can_run checks whether the current cpu may run. One question here: why is CPUState used rather than X86CPU? Presumably because the data in CPUState is already sufficient.
bool cpu_can_run(CPUState *cpu)
{
if (cpu->stop) {
return false;
}
if (cpu_is_stopped(cpu)) {
return false;
}
return true;
}
If it cannot run, qemu_wait_io_event is called, which blocks the thread on the cpu->halt_cond condition variable under the global lock qemu_global_mutex.
void qemu_wait_io_event(CPUState *cpu)
{
bool slept = false;
while (cpu_thread_is_idle(cpu)) {
if (!slept) {
slept = true;
qemu_plugin_vcpu_idle_cb(cpu);
}
qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
}
if (slept) {
qemu_plugin_vcpu_resume_cb(cpu);
}
qemu_wait_io_event_common(cpu);
}
When main executes vm_start -> resume_all_vcpus -> qemu_cpu_kick, qemu_cpu_kick wakes the VCPU up.
void qemu_cpu_kick(CPUState *cpu)
{
qemu_cond_broadcast(cpu->halt_cond);
if (cpus_accel->kick_vcpu_thread) {
cpus_accel->kick_vcpu_thread(cpu);
} else { /* default */
cpus_kick_thread(cpu);
}
}
Next, kvm_cpu_exec. Note that this is the execution function for QEMU with KVM; TCG uses a different one.
int kvm_cpu_exec(CPUState *cpu)
{
struct kvm_run *run = cpu->kvm_run;
int ret, run_ret;
...
qemu_mutex_unlock_iothread();
cpu_exec_start(cpu);
do {
MemTxAttrs attrs;
if (cpu->vcpu_dirty) {
kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
cpu->vcpu_dirty = false;
}
kvm_arch_pre_run(cpu, run);
...
/* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
* Matching barrier in kvm_eat_signals.
*/
smp_rmb();
run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); // enter KVM here; if this returns, something happened
// that KVM could not handle and QEMU must deal with it
attrs = kvm_arch_post_run(cpu, run);
...
trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); // exit_reason is written by KVM
switch (run->exit_reason) { // everything handled here is what KVM itself could not handle
case KVM_EXIT_IO:
DPRINTF("handle_io\n");
/* Called outside BQL */
kvm_handle_io(run->io.port, attrs,
(uint8_t *)run + run->io.data_offset,
run->io.direction,
run->io.size,
run->io.count);
ret = 0;
break;
case KVM_EXIT_MMIO:
DPRINTF("handle_mmio\n");
/* Called outside BQL */
address_space_rw(&address_space_memory,
run->mmio.phys_addr, attrs,
run->mmio.data,
run->mmio.len,
run->mmio.is_write);
ret = 0;
break;
case KVM_EXIT_IRQ_WINDOW_OPEN:
case KVM_EXIT_SHUTDOWN:
case KVM_EXIT_UNKNOWN:
case KVM_EXIT_INTERNAL_ERROR:
case KVM_EXIT_SYSTEM_EVENT:
switch (run->system_event.type) {
case KVM_SYSTEM_EVENT_SHUTDOWN:
case KVM_SYSTEM_EVENT_RESET:
case KVM_SYSTEM_EVENT_CRASH:
default:
DPRINTF("kvm_arch_handle_exit\n");
ret = kvm_arch_handle_exit(cpu, run);
break;
}
break;
default:
DPRINTF("kvm_arch_handle_exit\n");
ret = kvm_arch_handle_exit(cpu, run);
break;
}
} while (ret == 0); // keep looping unless we have to bail out
cpu_exec_end(cpu);
qemu_mutex_lock_iothread();
if (ret < 0) {
cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
vm_stop(RUN_STATE_INTERNAL_ERROR);
}
qatomic_set(&cpu->exit_request, 0);
return ret;
}
kvm_arch_pre_run first does some preparation before running, such as injecting NMI and SMI interrupts; then the ioctl(KVM_RUN) system call tells KVM to run this VCPU. While handling the ioctl, the KVM module executes the corresponding VMX instructions, switching the physical CPU that runs the VCPU from VMX root to VMX non-root operation and starting to execute guest code. When the guest triggers a VM Exit (the possible causes are listed later), control returns to KVM; if KVM cannot handle the exit it is handed up to QEMU, i.e. ioctl(KVM_RUN) returns and kvm_arch_post_run does some initial processing. QEMU then uses the data in the shared kvm_run structure (see the shared-data section above) to determine the exit reason and handle it; the switch that follows deals with exactly those cases KVM could not handle.
Next, how ioctl(KVM_RUN) runs inside the kernel. This ioctl is handled by kvm_vcpu_ioctl, which deals specifically with VCPU-level ioctls, unlike the two handlers seen earlier (one for the /dev/kvm device, one for the VM).
static long kvm_vcpu_ioctl (struct file *filp,
unsigned int ioctl, unsigned long arg)
{
struct kvm_vcpu *vcpu = filp->private_data;
void __user *argp = (void __user *)arg;
int r;
struct kvm_fpu *fpu = NULL;
struct kvm_sregs *kvm_sregs = NULL;
...
switch (ioctl) {
case KVM_RUN: {
struct pid *oldpid;
r = -EINVAL;
if (arg)
goto out;
oldpid = rcu_access_pointer(vcpu->pid);
if (unlikely(oldpid != task_pid(current))) {
/* The thread running this VCPU changed. */
struct pid *newpid;
r = kvm_arch_vcpu_run_pid_change(vcpu);
if (r)
break;
newpid = get_task_pid(current, PIDTYPE_PID);
rcu_assign_pointer(vcpu->pid, newpid);
if (oldpid)
synchronize_rcu();
put_pid(oldpid);
}
r = kvm_arch_vcpu_ioctl_run(vcpu);
trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
break;
}
case KVM_GET_REGS:
case KVM_SET_REGS:
case KVM_GET_SREGS:
case KVM_SET_SREGS:
case KVM_SET_MP_STATE:
case KVM_SET_GUEST_DEBUG:
case KVM_SET_SIGNAL_MASK:
case KVM_GET_FPU:
case KVM_SET_FPU:
case KVM_GET_STATS_FD:
default:
r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
}
...
return r;
}
For the KVM_RUN case, kvm_arch_vcpu_ioctl_run does the work.
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
int r;
vcpu_load(vcpu); // load this vcpu's state (its VMCS on VMX) onto the physical cpu
kvm_sigset_activate(vcpu);
kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
...
if (kvm_run->immediate_exit)
r = -EINTR;
else
r = vcpu_run(vcpu);
...
vcpu_put(vcpu); // restore the host context
return r;
}
kvm_arch_vcpu_ioctl_run mainly calls vcpu_run:
static int vcpu_run(struct kvm_vcpu *vcpu)
{
int r;
struct kvm *kvm = vcpu->kvm;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
r = vcpu_block(kvm, vcpu);
}
if (r <= 0) // the return value decides whether to go back to QEMU; if KVM handled the exit, loop again
break;
kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
if (dm_request_for_irq_injection(vcpu) &&
kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
r = 0;
vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
++vcpu->stat.request_irq_exits;
break;
}
if (__xfer_to_guest_mode_work_pending()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
r = xfer_to_guest_mode_handle_work(vcpu);
if (r)
return r;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
return r;
}
Like the loop on the QEMU side, vcpu_run first calls kvm_vcpu_running to decide whether the current VCPU can run:
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu))
kvm_check_nested_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
}
kvm_vcpu_running also reads vcpu->arch.hflags and checks HF_GUEST_MASK to determine whether the VCPU is currently in (nested) guest mode:
static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
{
return vcpu->arch.hflags & HF_GUEST_MASK;
}
If it can run, vcpu_enter_guest is called to enter the guest; before entering, the requests pending on vcpu->requests are processed via kvm_check_request. This function is central to entering, running and exiting the guest OS, and we will come back to it when needed. A sketch of the request-handling pattern follows, then the actual (abridged) function.
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
bool req_int_win =
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;
bool req_immediate_exit = false;
...
preempt_disable();
// save the host state into the VMCS
static_call(kvm_x86_prepare_guest_switch)(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
* IPI are then delayed after guest entry, which ensures that they
* result in virtual interrupt delivery.
*/
local_irq_disable();
vcpu->mode = IN_GUEST_MODE;
...
for (;;) {
// on VMX this callback is vmx_vcpu_run
exit_fastpath = static_call(kvm_x86_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
if (unlikely(kvm_vcpu_exit_request(vcpu))) { // check whether a pending signal/request forces us to stop re-entering
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
break;
}
if (vcpu->arch.apicv_active)
static_call(kvm_x86_sync_pir_to_irr)(vcpu);
}
...
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
static_call(kvm_x86_handle_exit_irqoff)(vcpu); // handle external interrupts first, while IRQs are still off
/*
* Consume any pending interrupts, including the possible source of
* VM-Exit on SVM and any ticks that occur between VM-Exit and now.
* An instruction is required after local_irq_enable() to fully unblock
* interrupts on processors that implement an interrupt shadow, the
* stat.exits increment will do nicely.
*/
kvm_before_interrupt(vcpu);
local_irq_enable();
++vcpu->stat.exits;
local_irq_disable();
kvm_after_interrupt(vcpu);
...
// the overall exit handler; exits are dispatched from here (analyzed below, where the full table of exit handlers is also listed)
r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
return r; // r <= 0 means we must return to user space for handling
...
}
After these requests are handled and pending events injected (inject_pending_event), kvm_mmu_reload is called to deal with the memory-virtualization side. Then kvm_x86_prepare_guest_switch saves the host state into the VMCS so that the host can resume correctly after the next VM exit.
After that comes VMX's run callback, vmx_vcpu_run. It is an enormous function that I have not fully worked through; here we only look at the parts that matter for now.
static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
...
/* The actual VMENTER/EXIT is in the .noinstr.text section. */
vmx_vcpu_enter_exit(vcpu, vmx);
...
vmx_register_cache_reset(vcpu);
pt_guest_exit(vmx);
kvm_load_host_xsave_state(vcpu);
if (is_guest_mode(vcpu)) {
/*
* Track VMLAUNCH/VMRESUME that have made past guest state
* checking.
*/
if (vmx->nested.nested_run_pending &&
!vmx->exit_reason.failed_vmentry)
++vcpu->stat.nested_run;
vmx->nested.nested_run_pending = 0;
}
vmx->idt_vectoring_info = 0;
if (unlikely(vmx->fail)) {
vmx->exit_reason.full = 0xdead;
return EXIT_FASTPATH_NONE;
}
vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON); // read the exit reason from the VMCS
if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
if (likely(!vmx->exit_reason.failed_vmentry))
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
if (unlikely(vmx->exit_reason.failed_vmentry))
return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
if (is_guest_mode(vcpu))
return EXIT_FASTPATH_NONE;
return vmx_exit_handlers_fastpath(vcpu);
}
vmx_vcpu_enter_exit eventually calls vmx_vmenter, an assembly routine that uses the VMLAUNCH (or VMRESUME) instruction to switch into guest mode:
/**
* vmx_vmenter - VM-Enter the current loaded VMCS
*
* %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME
*
* Returns:
* %RFLAGS.CF is set on VM-Fail Invalid
* %RFLAGS.ZF is set on VM-Fail Valid
* %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
*
* Note that VMRESUME/VMLAUNCH fall-through and return directly if
* they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
* to vmx_vmexit.
*/
SYM_FUNC_START_LOCAL(vmx_vmenter)
/* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
je 2f
1: vmresume
ret
2: vmlaunch
ret
3: cmpb $0, kvm_rebooting
je 4f
ret
4: ud2
_ASM_EXTABLE(1b, 3b)
_ASM_EXTABLE(2b, 3b)
SYM_FUNC_END(vmx_vmenter)
The return from guest mode is also handled in vmx_vcpu_run, which reads the exit reason with vmcs_read32:
vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
By the time vmx_vcpu_run returns, a full VM Entry and VM Exit have taken place.
After the exit, the handle_exit callback, vmx_handle_exit, is invoked to process it (external interrupts included):
static struct kvm_x86_ops vmx_x86_ops __initdata = {
...
.run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
...
};
Finally, after the exit has been handled, the return value of vcpu_enter_guest decides whether to go back to QEMU: a value less than or equal to 0 breaks out of the loop so that the ioctl returns to user-space QEMU, while a return value of 1 means KVM handled it and the next iteration proceeds.
Let's dig further into which causes make the guest exit and how they are handled. The VMX exit-handling callback is vmx_handle_exit, whose core is __vmx_handle_exit:
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
union vmx_exit_reason exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
u16 exit_handler_index;
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
* querying dirty_bitmap, we only need to kick all vcpus out of guest
* mode as if vcpus is in root mode, the PML buffer must has been
* flushed already. Note, PML is never enabled in hardware while
* running L2.
*/
if (enable_pml && !is_guest_mode(vcpu))
vmx_flush_pml_buffer(vcpu);
/*
* We should never reach this point with a pending nested VM-Enter, and
* more specifically emulation of L2 due to invalid guest state (see
* below) should never happen as that means we incorrectly allowed a
* nested VM-Enter with an invalid vmcs12.
*/
if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
return -EIO;
/* If guest state is invalid, start emulating */
if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
if (is_guest_mode(vcpu)) {
/*
* PML is never enabled when running L2, bail immediately if a
* PML full exit occurs as something is horribly wrong.
*/
if (exit_reason.basic == EXIT_REASON_PML_FULL)
goto unexpected_vmexit;
/*
* The host physical addresses of some pages of guest memory
* are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
* Page). The CPU may write to these pages via their host
* physical address while L2 is running, bypassing any
* address-translation-based dirty tracking (e.g. EPT write
* protection).
*
* Mark them dirty on every exit from L2 to prevent them from
* getting out of sync with dirty tracking.
*/
nested_mark_vmcs12_pages_dirty(vcpu);
if (nested_vmx_reflect_vmexit(vcpu))
return 1;
}
if (exit_reason.failed_vmentry) {
dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason.full;
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
if (unlikely(vmx->fail)) {
dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
/*
* Note:
* Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
* delivery event since it indicates guest is accessing MMIO.
* The vm-exit can be triggered again after return to guest that
* will cause infinite loop.
*/
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
int ndata = 3;
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
vcpu->run->internal.data[0] = vectoring_info;
vcpu->run->internal.data[1] = exit_reason.full;
vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
vcpu->run->internal.data[ndata++] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
}
vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
vcpu->run->internal.ndata = ndata;
return 0;
}
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked)) {
if (!vmx_interrupt_blocked(vcpu)) {
vmx->loaded_vmcs->soft_vnmi_blocked = 0;
} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
vcpu->arch.nmi_pending) {
/*
* This CPU don't support us in finding the end of an
* NMI-blocked window if the guest runs with IRQs
* disabled. So we pull the trigger after 1 s of
* futile waiting, but inform the user about this.
*/
printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
"state on VCPU %d after 1 s timeout\n",
__func__, vcpu->vcpu_id);
vmx->loaded_vmcs->soft_vnmi_blocked = 0;
}
}
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
goto unexpected_vmexit;
#ifdef CONFIG_RETPOLINE
if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
return kvm_emulate_wrmsr(vcpu);
else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
return handle_preemption_timer(vcpu);
else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
return handle_interrupt_window(vcpu);
else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
return handle_external_interrupt(vcpu);
else if (exit_reason.basic == EXIT_REASON_HLT)
return kvm_emulate_halt(vcpu);
else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
return handle_ept_misconfig(vcpu);
#endif
exit_handler_index = array_index_nospec((u16)exit_reason.basic,
kvm_vmx_max_exit_handlers);
if (!kvm_vmx_exit_handlers[exit_handler_index])
goto unexpected_vmexit;
return kvm_vmx_exit_handlers[exit_handler_index](vcpu); // the handlers above are special-cased under CONFIG_RETPOLINE to avoid this indirect call on hot exits
unexpected_vmexit:
vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
exit_reason.full);
dump_vmcs(vcpu);
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
vcpu->run->internal.ndata = 2;
vcpu->run->internal.data[0] = exit_reason.full;
vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
return 0;
}
Here is the full table of exit handlers:
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
* to be done to userspace and return 0.
*/
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
[EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
[EXIT_REASON_CPUID] = kvm_emulate_cpuid,
[EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
[EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
[EXIT_REASON_HLT] = kvm_emulate_halt,
[EXIT_REASON_INVD] = kvm_emulate_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
[EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
[EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
[EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
[EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
[EXIT_REASON_VMPTRST] = handle_vmx_instruction,
[EXIT_REASON_VMREAD] = handle_vmx_instruction,
[EXIT_REASON_VMRESUME] = handle_vmx_instruction,
[EXIT_REASON_VMWRITE] = handle_vmx_instruction,
[EXIT_REASON_VMOFF] = handle_vmx_instruction,
[EXIT_REASON_VMON] = handle_vmx_instruction,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_APIC_WRITE] = handle_apic_write,
[EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
[EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
[EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
[EXIT_REASON_GDTR_IDTR] = handle_desc,
[EXIT_REASON_LDTR_TR] = handle_desc,
[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
[EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
[EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
[EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
[EXIT_REASON_INVEPT] = handle_vmx_instruction,
[EXIT_REASON_INVVPID] = handle_vmx_instruction,
[EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
[EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
};
If kvm_vcpu_running decides the VCPU cannot run, vcpu_block is called instead, which ultimately calls schedule to yield the physical CPU.
Now let's look at how TCG gets a vcpu running.
rr_start_vcpu_thread / mttcg_start_vcpu_thread -> rr_cpu_thread_fn (single-threaded) / mttcg_cpu_thread_fn (multi-threaded) -> tcg_cpus_exec -> cpu_exec -> cpu_exec_enter
static void *mttcg_cpu_thread_fn(void *arg)
{
...
/* process any pending work */
cpu->exit_request = 1;
// again a do/while loop
do {
if (cpu_can_run(cpu)) {
int r;
qemu_mutex_unlock_iothread();
r = tcg_cpus_exec(cpu);
qemu_mutex_lock_iothread();
switch (r) {
case EXCP_DEBUG:
cpu_handle_guest_debug(cpu);
break;
case EXCP_HALTED:
/*
* during start-up the vCPU is reset and the thread is
* kicked several times. If we don't ensure we go back
* to sleep in the halted state we won't cleanly
* start-up when the vCPU is enabled.
*
* cpu->halted should ensure we sleep in wait_io_event
*/
g_assert(cpu->halted);
break;
case EXCP_ATOMIC:
qemu_mutex_unlock_iothread();
cpu_exec_step_atomic(cpu);
qemu_mutex_lock_iothread();
default:
/* Ignore everything else? */
break;
}
}
qatomic_mb_set(&cpu->exit_request, 0);
qemu_wait_io_event(cpu);
} while (!cpu->unplug || cpu_can_run(cpu));
tcg_cpus_destroy(cpu);
qemu_mutex_unlock_iothread();
rcu_remove_force_rcu_notifier(&force_rcu.notifier);
rcu_unregister_thread();
return NULL;
}
VCPU Scheduling
Each VCPU of a VM corresponds to a thread on the host and is scheduled by the host kernel's scheduler like any other thread. If the VCPU threads are not pinned to physical CPUs, a VCPU thread may land on a different physical CPU each time it runs. Every physical CPU keeps a per-cpu pointer to the currently loaded VMCS (current_vmcs), and scheduling a VCPU essentially means pointing that per-cpu pointer at the VMCS of the VCPU about to run. Two important functions are involved:
vcpu_load loads the VCPU's state onto the physical CPU, and vcpu_put switches the VCPU currently on the physical CPU out and saves its VMCS state.
/*
* Switches to specified vcpu, until a matching vcpu_put()
*/
void vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu = get_cpu(); // get the current CPU id (and disable preemption)
__this_cpu_write(kvm_running_vcpu, vcpu);
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu); // per-architecture load function that loads the VMCS onto the cpu; on x86/VMX this reaches vmx_vcpu_load
put_cpu(); // re-enable preemption
}
EXPORT_SYMBOL_GPL(vcpu_load);
void vcpu_put(struct kvm_vcpu *vcpu)
{
preempt_disable();
kvm_arch_vcpu_put(vcpu);
preempt_notifier_unregister(&vcpu->preempt_notifier);
__this_cpu_write(kvm_running_vcpu, NULL);
preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);
kvm_arch_vcpu_ioctl_run is the handler for ioctl(KVM_RUN); it calls vcpu_load at its start and vcpu_put at its end.
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
int r;
// here
vcpu_load(vcpu);
kvm_sigset_activate(vcpu);
kvm_run->flags = 0;
kvm_load_guest_fpu(vcpu);
...
if (kvm_run->immediate_exit)
r = -EINTR;
else // cpu run
r = vcpu_run(vcpu);
...
// here
vcpu_put(vcpu);
return r;
}