Description:
I'm following the tutorial on hypervisor development. After day 4 of the series I cannot make my code run. Right after __vmx_vmlaunch
instruction gets executed successfully, the virtual machine I'm testing the hypervisor on, reboots. I believe this is caused by some incorrect settings of VMCS Host State Area (Chapter 24.5). There is no BSOD, crash nor error message in WinDbg.
I see two ways to approach this problem. One is to somehow extract more information from WinDbg. And the second one would require someone to spot what stupid thing I'm doing or missing in the vmcs initialization.
Unfortunately the code in tutorial is incomplete. I've spend some fair time to make sure that everything that tutorial covers is the same in my code. I will highlight parts which where added by me.
I'm really sorry for such an amount of code. Again let me emphasise that I believe the problem lies within the HOST
fields (as the crash inside guest vm shouldn't crash the host).
int init_vmcs(struct __vcpu_t* vcpu)
{
log_entry("init_vmcs()\n");
// Determinate exact size which is implementation specific.
// We expect it to be 4KB, but it is not guaranteed.
union __vmx_basic_msr_t vmx_basic_msr = { 0 };
vmx_basic_msr.control = __readmsr(IA32_VMX_BASIC);
if (vmx_basic_msr.bits.vmxon_region_size != sizeof(struct __vmcs_t)) {
log_error("Non standard vmcs region size: %llx. Support not yet implemented.\n",
vmx_basic_msr.bits.vmxon_region_size);
log_exit("init_vmcs()\n");
return -1;
}
PHYSICAL_ADDRESS physical_max;
physical_max.QuadPart = MAXULONG64;
vcpu->vmcs = MmAllocateContiguousMemory(sizeof(struct __vmcs_t), physical_max);
if (!vcpu->vmcs) {
log_error("Failed to allocate vcpu->vmcs(MmAllocateContiguousMemory failed).\n");
log_exit("init_vmcs()\n");
return -1;
}
RtlSecureZeroMemory(vcpu->vmcs, sizeof(struct __vmcs_t));
vcpu->vmcs_physical = MmGetPhysicalAddress(vcpu->vmcs).QuadPart;
// Discover VMCS revision identifier that a processor uses by reading the
// VMX capability MSR IA32_VMX_BASIC.
vcpu->vmcs->header.bits.revision_identifier = (unsigned int)vmx_basic_msr.bits.vmcs_revision_identifier;
vcpu->vmcs->header.bits.shadow_vmcs_indicator = 0;
// Before loading vmcs we invoke vmclear to flush data which might be cached by processor.
if (__vmx_vmclear(&vcpu->vmcs_physical) || __vmx_vmptrld(&vcpu->vmcs_physical)) {
log_error("Failed to flush data or load vmcs. (__vmx_vmclear or __vmx_vmptrld failed).\n");
log_exit("init_vmcs()\n");
return -1;
}
// Initialize VMCS Guest State Area.
if (__vmx_vmwrite(GUEST_CR0, __readcr0()) ||
__vmx_vmwrite(GUEST_CR3, __readcr3()) ||
__vmx_vmwrite(GUEST_CR4, __readcr4()) ||
__vmx_vmwrite(GUEST_DR7, __readdr(7)) ||
__vmx_vmwrite(GUEST_RSP, vcpu->guest_rsp) ||
__vmx_vmwrite(GUEST_RIP, vcpu->guest_rip) ||
__vmx_vmwrite(GUEST_RFLAGS, __readeflags()) ||
__vmx_vmwrite(GUEST_DEBUG_CONTROL, __readmsr(IA32_DEBUGCTL)) ||
__vmx_vmwrite(GUEST_SYSENTER_ESP, __readmsr(IA32_SYSENTER_ESP)) ||
__vmx_vmwrite(GUEST_SYSENTER_EIP, __readmsr(IA32_SYSENTER_EIP)) ||
__vmx_vmwrite(GUEST_SYSENTER_CS, __readmsr(IA32_SYSENTER_CS)) ||
__vmx_vmwrite(GUEST_VMCS_LINK_POINTER, ~0ULL) ||
__vmx_vmwrite(GUEST_FS_BASE, __readmsr(IA32_FS_BASE)) ||
__vmx_vmwrite(GUEST_GS_BASE, __readmsr(IA32_GS_BASE))
) {
log_error("Failed to set guest state. (__vmx_vmwrite failed).\n");
log_exit("init_vmcs()\n");
return -1;
}
if (__vmx_vmwrite(CR0_READ_SHADOW, __readcr0()) ||
__vmx_vmwrite(CR4_READ_SHADOW, __readcr4())
) {
log_error("Failed to set cr0_read_shadow or cr4_read_shadow. (__vmx_vmwrite failed).\n");
log_exit("init_vmcs()\n");
return -1;
}
union __vmx_entry_control_t entry_controls = { 0 };
entry_controls.bits.ia32e_mode_guest = 1;
vmx_adjust_entry_controls(&entry_controls);
__vmx_vmwrite(VM_ENTRY_CONTROLS, entry_controls.control);
union __vmx_exit_control_t exit_controls = { 0 };
exit_controls.bits.host_address_space_size = 1;
vmx_adjust_exit_controls(&exit_controls);
__vmx_vmwrite(VM_EXIT_CONTROLS, exit_controls.control);
union __vmx_pinbased_control_msr_t pinbased_controls = { 0 };
vmx_adjust_pinbased_controls(&pinbased_controls);
__vmx_vmwrite(PIN_BASED_VM_EXECUTION_CONTROLS, pinbased_controls.control);
union __vmx_primary_processor_based_control_t primary_controls = { 0 };
primary_controls.bits.use_msr_bitmaps = 1;
primary_controls.bits.active_secondary_controls = 1;
vmx_adjust_primary_processor_based_controls(&primary_controls);
__vmx_vmwrite(PRIMARY_PROCESSOR_BASED_VM_EXECUTION_CONTROLS, primary_controls.control);
union __vmx_secondary_processor_based_control_t secondary_controls = { 0 };
secondary_controls.bits.enable_rdtscp = 1;
secondary_controls.bits.enable_xsave_xrstor = 1;
secondary_controls.bits.enable_invpcid = 1;
vmx_adjust_secondary_processor_based_controls(&secondary_controls);
__vmx_vmwrite(SECONDARY_PROCESSOR_BASED_VM_EXECUTION_CONTROLS, secondary_controls.control);
__vmx_vmwrite(GUEST_CS_SELECTOR, __read_cs());
__vmx_vmwrite(GUEST_SS_SELECTOR, __read_ss());
__vmx_vmwrite(GUEST_DS_SELECTOR, __read_ds());
__vmx_vmwrite(GUEST_ES_SELECTOR, __read_es());
__vmx_vmwrite(GUEST_FS_SELECTOR, __read_fs());
__vmx_vmwrite(GUEST_GS_SELECTOR, __read_gs());
__vmx_vmwrite(GUEST_LDTR_SELECTOR, __read_ldtr());
__vmx_vmwrite(GUEST_TR_SELECTOR, __read_tr());
__vmx_vmwrite(GUEST_CS_LIMIT, __segmentlimit(__read_cs()));
__vmx_vmwrite(GUEST_SS_LIMIT, __segmentlimit(__read_ss()));
__vmx_vmwrite(GUEST_DS_LIMIT, __segmentlimit(__read_ds()));
__vmx_vmwrite(GUEST_ES_LIMIT, __segmentlimit(__read_es()));
__vmx_vmwrite(GUEST_FS_LIMIT, __segmentlimit(__read_fs()));
__vmx_vmwrite(GUEST_GS_LIMIT, __segmentlimit(__read_gs()));
__vmx_vmwrite(GUEST_LDTR_LIMIT, __segmentlimit(__read_ldtr()));
__vmx_vmwrite(GUEST_TR_LIMIT, __segmentlimit(__read_tr()));
struct __pseudo_descriptor_64_t gdtr;
struct __pseudo_descriptor_64_t idtr;
_sgdt(&gdtr);
__sidt(&idtr);
__vmx_vmwrite(GUEST_GDTR_BASE, gdtr.base_address);
__vmx_vmwrite(GUEST_GDTR_LIMIT, gdtr.limit);
__vmx_vmwrite(GUEST_IDTR_BASE, idtr.base_address);
__vmx_vmwrite(GUEST_IDTR_LIMIT, idtr.limit);
__vmx_vmwrite(GUEST_CS_BASE, get_segment_base(gdtr.base_address, __read_cs()));
__vmx_vmwrite(GUEST_DS_BASE, get_segment_base(gdtr.base_address, __read_ds()));
__vmx_vmwrite(GUEST_SS_BASE, get_segment_base(gdtr.base_address, __read_ss()));
__vmx_vmwrite(GUEST_ES_BASE, get_segment_base(gdtr.base_address, __read_es()));
__vmx_vmwrite(GUEST_CS_ACCESS_RIGHTS, read_segment_access_rights(__read_cs()));
__vmx_vmwrite(GUEST_SS_ACCESS_RIGHTS, read_segment_access_rights(__read_ss()));
__vmx_vmwrite(GUEST_DS_ACCESS_RIGHTS, read_segment_access_rights(__read_ds()));
__vmx_vmwrite(GUEST_ES_ACCESS_RIGHTS, read_segment_access_rights(__read_es()));
__vmx_vmwrite(GUEST_FS_ACCESS_RIGHTS, read_segment_access_rights(__read_fs()));
__vmx_vmwrite(GUEST_GS_ACCESS_RIGHTS, read_segment_access_rights(__read_gs()));
__vmx_vmwrite(GUEST_LDTR_ACCESS_RIGHTS, read_segment_access_rights(__read_ldtr()));
__vmx_vmwrite(GUEST_TR_ACCESS_RIGHTS, read_segment_access_rights(__read_tr()));
__vmx_vmwrite(GUEST_LDTR_BASE, get_segment_base(gdtr.base_address, __read_ldtr()));
__vmx_vmwrite(GUEST_TR_BASE, get_segment_base(gdtr.base_address, __read_tr()));
// Initialize VMCS Host State Area.
__vmx_vmwrite(HOST_CR0, __readcr0()); // Added by me
__vmx_vmwrite(HOST_CR3, __readcr3()); // Added by me
__vmx_vmwrite(HOST_CR4, __readcr4()); // Added by me
// Fields RPL and TI in host selector fields must be cleared.
unsigned short host_selector_mask = 7;
__vmx_vmwrite(HOST_CS_SELECTOR, __read_cs() & ~host_selector_mask);
__vmx_vmwrite(HOST_SS_SELECTOR, __read_ss() & ~host_selector_mask);
__vmx_vmwrite(HOST_DS_SELECTOR, __read_ds() & ~host_selector_mask);
__vmx_vmwrite(HOST_ES_SELECTOR, __read_es() & ~host_selector_mask);
__vmx_vmwrite(HOST_FS_SELECTOR, __read_fs() & ~host_selector_mask);
__vmx_vmwrite(HOST_GS_SELECTOR, __read_gs() & ~host_selector_mask);
__vmx_vmwrite(HOST_TR_SELECTOR, __read_tr() & ~host_selector_mask);
__vmx_vmwrite(HOST_TR_BASE, get_segment_base(gdtr.base_address, __read_tr()));
__vmx_vmwrite(HOST_GDTR_BASE, gdtr.base_address);
__vmx_vmwrite(HOST_IDTR_BASE, idtr.base_address);
unsigned __int64 vmm_stack = (unsigned __int64)vcpu->vmm_context->stack + VMM_STACK_SIZE;
if (__vmx_vmwrite(HOST_RSP, vmm_stack) ||
__vmx_vmwrite(HOST_RIP, vmm_entrypoint)
) {
log_error("Failed to set host_rsp, host_rip. (__vmx_vmwrite failed).\n");
log_exit("init_vmcs()\n");
return -1;
}
log_exit("init_vmcs()\n");
return 0;
}
void init_logical_processor(struct __vmm_context_t *vmm_context, void *guest_rsp)
{
log_entry("init_logical_processor()\n");
unsigned long cur_processor_number = KeGetCurrentProcessorNumber();
struct __vcpu_t* vcpu = vmm_context->vcpu_table[cur_processor_number];
log_debug("vcpu: %llx, guest_rsp: %llx\n", cur_processor_number, guest_rsp);
vcpu->guest_rsp = guest_rsp;
vcpu->guest_rip = (void*) guest_entry_stub;
adjust_control_registers();
if (enable_vmx_operation() != 0) {
log_error("Failed to enable_vmx_operation.\n");
goto _end;
}
if (!vm_has_cpuid_support()) {
log_error("VMX operation is not supported by the processor.\n");
goto _end;
}
log_success("VMX operation is supported by the processor.\n");
if (init_vmxon(vcpu)) {
log_error("Failed to initialize vmxon region.\n");
goto _end;
}
log_success("Initialized vmxon region.\n");
unsigned char vmxon_res = __vmx_on(&vcpu->vmxon_physical);
if (vmxon_res != 0) {
log_error("Failed to put vcpu into VMX operation. Error code: %d\n", vmxon_res);
goto _end;
}
log_success("vmx_on succeeded.\n");
if (init_vmcs(vcpu)) {
log_error("Failed to initialize vmcs.\n");
goto _end;
}
log_success("Initialized vmcs.\n");
unsigned char vmlaunch_res = vmxlaunch(); // just a wrapper over __vmx_vmlaunch
if (vmlaunch_res != 0) {
goto _end;
}
_end:
log_exit("init_logical_processor()\n");
}
vmm_entrypoint proc
int 3 ; addded by me
vmm_entrypoint endp
guest_entry_stub proc
mov rax, 1337h
hlt
guest_entry_stub endp
Update
I've read again the Intel manual section regarding VM-entry checks and found out that my init_vmcs
function wasn't setting HOST_FS_BASE
, HOST_GS_BASE
. After adding these fields it finally worked and trapped inside vmm_entrypoint
.
However I would love to hear some solution on how to debug the unexpected shutdowns.