FreeBSD/Linux Kernel Cross Reference
sys/amd64/vmm/intel/vmx.c


    1 /*-
    2  * Copyright (c) 2011 NetApp, Inc.
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  *
   26  * $FreeBSD$
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD$");
   31 
   32 #include <sys/param.h>
   33 #include <sys/systm.h>
   34 #include <sys/smp.h>
   35 #include <sys/kernel.h>
   36 #include <sys/malloc.h>
   37 #include <sys/pcpu.h>
   38 #include <sys/proc.h>
   39 #include <sys/sysctl.h>
   40 
   41 #include <vm/vm.h>
   42 #include <vm/pmap.h>
   43 
   44 #include <machine/psl.h>
   45 #include <machine/cpufunc.h>
   46 #include <machine/md_var.h>
   47 #include <machine/segments.h>
   48 #include <machine/smp.h>
   49 #include <machine/specialreg.h>
   50 #include <machine/vmparam.h>
   51 
   52 #include <machine/vmm.h>
   53 #include <machine/vmm_dev.h>
   54 #include <machine/vmm_instruction_emul.h>
   55 #include "vmm_lapic.h"
   56 #include "vmm_host.h"
   57 #include "vmm_ioport.h"
   58 #include "vmm_ipi.h"
   59 #include "vmm_ktr.h"
   60 #include "vmm_stat.h"
   61 #include "vatpic.h"
   62 #include "vlapic.h"
   63 #include "vlapic_priv.h"
   64 
   65 #include "ept.h"
   66 #include "vmx_cpufunc.h"
   67 #include "vmx.h"
   68 #include "vmx_msr.h"
   69 #include "x86.h"
   70 #include "vmx_controls.h"
   71 
   72 #define PINBASED_CTLS_ONE_SETTING                                       \
   73         (PINBASED_EXTINT_EXITING        |                               \
   74          PINBASED_NMI_EXITING           |                               \
   75          PINBASED_VIRTUAL_NMI)
   76 #define PINBASED_CTLS_ZERO_SETTING      0
   77 
   78 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
   79         (PROCBASED_INT_WINDOW_EXITING   |                               \
   80          PROCBASED_NMI_WINDOW_EXITING)
   81 
   82 #define PROCBASED_CTLS_ONE_SETTING                                      \
   83         (PROCBASED_SECONDARY_CONTROLS   |                               \
   84          PROCBASED_MWAIT_EXITING        |                               \
   85          PROCBASED_MONITOR_EXITING      |                               \
   86          PROCBASED_IO_EXITING           |                               \
   87          PROCBASED_MSR_BITMAPS          |                               \
   88          PROCBASED_CTLS_WINDOW_SETTING  |                               \
   89          PROCBASED_CR8_LOAD_EXITING     |                               \
   90          PROCBASED_CR8_STORE_EXITING)
   91 #define PROCBASED_CTLS_ZERO_SETTING     \
   92         (PROCBASED_CR3_LOAD_EXITING |   \
   93         PROCBASED_CR3_STORE_EXITING |   \
   94         PROCBASED_IO_BITMAPS)
   95 
   96 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
   97 #define PROCBASED_CTLS2_ZERO_SETTING    0
   98 
   99 #define VM_EXIT_CTLS_ONE_SETTING                                        \
  100         (VM_EXIT_HOST_LMA                       |                       \
  101         VM_EXIT_SAVE_EFER                       |                       \
  102         VM_EXIT_LOAD_EFER                       |                       \
  103         VM_EXIT_ACKNOWLEDGE_INTERRUPT)
  104 
  105 #define VM_EXIT_CTLS_ZERO_SETTING       VM_EXIT_SAVE_DEBUG_CONTROLS
  106 
  107 #define VM_ENTRY_CTLS_ONE_SETTING       (VM_ENTRY_LOAD_EFER)
  108 
  109 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
  110         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
  111         VM_ENTRY_INTO_SMM                       |                       \
  112         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
  113 
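       /*
        * Each *_ONE_SETTING above names control bits that must be settable
        * to 1 and each *_ZERO_SETTING names bits that must be settable to 0.
        * vmx_init() passes every pair to vmx_set_ctlreg(), which validates
        * the request against the VMX capability MSRs and fails if the
        * processor cannot provide the desired setting.
        */
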
  114 #define HANDLED         1
  115 #define UNHANDLED       0
  116 
  117 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
  118 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
  119 
  120 SYSCTL_DECL(_hw_vmm);
  121 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
  122 
  123 int vmxon_enabled[MAXCPU];
  124 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
  125 
  126 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
  127 static uint32_t exit_ctls, entry_ctls;
  128 
  129 static uint64_t cr0_ones_mask, cr0_zeros_mask;
  130 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
  131              &cr0_ones_mask, 0, NULL);
  132 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
  133              &cr0_zeros_mask, 0, NULL);
  134 
  135 static uint64_t cr4_ones_mask, cr4_zeros_mask;
  136 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
  137              &cr4_ones_mask, 0, NULL);
  138 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
  139              &cr4_zeros_mask, 0, NULL);
  140 
  141 static int vmx_initialized;
  142 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
  143            &vmx_initialized, 0, "Intel VMX initialized");
  144 
  145 /*
  146  * Optional capabilities
  147  */
  148 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
  149 
  150 static int cap_halt_exit;
  151 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
  152     "HLT triggers a VM-exit");
  153 
  154 static int cap_pause_exit;
  155 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
  156     0, "PAUSE triggers a VM-exit");
  157 
  158 static int cap_unrestricted_guest;
  159 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
  160     &cap_unrestricted_guest, 0, "Unrestricted guests");
  161 
  162 static int cap_monitor_trap;
  163 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
  164     &cap_monitor_trap, 0, "Monitor trap flag");
  165 
  166 static int cap_invpcid;
  167 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
  168     0, "Guests are allowed to use INVPCID");
  169 
  170 static int virtual_interrupt_delivery;
  171 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
  172     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
  173 
  174 static int posted_interrupts;
  175 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
  176     &posted_interrupts, 0, "APICv posted interrupt support");
  177 
  178 static int pirvec;
  179 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
  180     &pirvec, 0, "APICv posted interrupt vector");
  181 
  182 static struct unrhdr *vpid_unr;
  183 static u_int vpid_alloc_failed;
  184 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
  185             &vpid_alloc_failed, 0, NULL);
  186 
  187 /*
  188  * Use the last page below 4GB as the APIC access address. This address is
  189  * occupied by the boot firmware so it is guaranteed that it will not conflict
  190  * with a page in system memory.
  191  */
  192 #define APIC_ACCESS_ADDRESS     0xFFFFF000
  193 
  194 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
  195 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
  196 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
  197 static void vmx_inject_pir(struct vlapic *vlapic);
  198 
  199 #ifdef KTR
  200 static const char *
  201 exit_reason_to_str(int reason)
  202 {
  203         static char reasonbuf[32];
  204 
  205         switch (reason) {
  206         case EXIT_REASON_EXCEPTION:
  207                 return "exception";
  208         case EXIT_REASON_EXT_INTR:
  209                 return "extint";
  210         case EXIT_REASON_TRIPLE_FAULT:
  211                 return "triplefault";
  212         case EXIT_REASON_INIT:
  213                 return "init";
  214         case EXIT_REASON_SIPI:
  215                 return "sipi";
  216         case EXIT_REASON_IO_SMI:
  217                 return "iosmi";
  218         case EXIT_REASON_SMI:
  219                 return "smi";
  220         case EXIT_REASON_INTR_WINDOW:
  221                 return "intrwindow";
  222         case EXIT_REASON_NMI_WINDOW:
  223                 return "nmiwindow";
  224         case EXIT_REASON_TASK_SWITCH:
  225                 return "taskswitch";
  226         case EXIT_REASON_CPUID:
  227                 return "cpuid";
  228         case EXIT_REASON_GETSEC:
  229                 return "getsec";
  230         case EXIT_REASON_HLT:
  231                 return "hlt";
  232         case EXIT_REASON_INVD:
  233                 return "invd";
  234         case EXIT_REASON_INVLPG:
  235                 return "invlpg";
  236         case EXIT_REASON_RDPMC:
  237                 return "rdpmc";
  238         case EXIT_REASON_RDTSC:
  239                 return "rdtsc";
  240         case EXIT_REASON_RSM:
  241                 return "rsm";
  242         case EXIT_REASON_VMCALL:
  243                 return "vmcall";
  244         case EXIT_REASON_VMCLEAR:
  245                 return "vmclear";
  246         case EXIT_REASON_VMLAUNCH:
  247                 return "vmlaunch";
  248         case EXIT_REASON_VMPTRLD:
  249                 return "vmptrld";
  250         case EXIT_REASON_VMPTRST:
  251                 return "vmptrst";
  252         case EXIT_REASON_VMREAD:
  253                 return "vmread";
  254         case EXIT_REASON_VMRESUME:
  255                 return "vmresume";
  256         case EXIT_REASON_VMWRITE:
  257                 return "vmwrite";
  258         case EXIT_REASON_VMXOFF:
  259                 return "vmxoff";
  260         case EXIT_REASON_VMXON:
  261                 return "vmxon";
  262         case EXIT_REASON_CR_ACCESS:
  263                 return "craccess";
  264         case EXIT_REASON_DR_ACCESS:
  265                 return "draccess";
  266         case EXIT_REASON_INOUT:
  267                 return "inout";
  268         case EXIT_REASON_RDMSR:
  269                 return "rdmsr";
  270         case EXIT_REASON_WRMSR:
  271                 return "wrmsr";
  272         case EXIT_REASON_INVAL_VMCS:
  273                 return "invalvmcs";
  274         case EXIT_REASON_INVAL_MSR:
  275                 return "invalmsr";
  276         case EXIT_REASON_MWAIT:
  277                 return "mwait";
  278         case EXIT_REASON_MTF:
  279                 return "mtf";
  280         case EXIT_REASON_MONITOR:
  281                 return "monitor";
  282         case EXIT_REASON_PAUSE:
  283                 return "pause";
  284         case EXIT_REASON_MCE_DURING_ENTRY:
  285                 return "mce-during-entry";
  286         case EXIT_REASON_TPR:
  287                 return "tpr";
  288         case EXIT_REASON_APIC_ACCESS:
  289                 return "apic-access";
  290         case EXIT_REASON_GDTR_IDTR:
  291                 return "gdtridtr";
  292         case EXIT_REASON_LDTR_TR:
  293                 return "ldtrtr";
  294         case EXIT_REASON_EPT_FAULT:
  295                 return "eptfault";
  296         case EXIT_REASON_EPT_MISCONFIG:
  297                 return "eptmisconfig";
  298         case EXIT_REASON_INVEPT:
  299                 return "invept";
  300         case EXIT_REASON_RDTSCP:
  301                 return "rdtscp";
  302         case EXIT_REASON_VMX_PREEMPT:
  303                 return "vmxpreempt";
  304         case EXIT_REASON_INVVPID:
  305                 return "invvpid";
  306         case EXIT_REASON_WBINVD:
  307                 return "wbinvd";
  308         case EXIT_REASON_XSETBV:
  309                 return "xsetbv";
  310         case EXIT_REASON_APIC_WRITE:
  311                 return "apic-write";
  312         default:
  313                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
  314                 return (reasonbuf);
  315         }
  316 }
  317 #endif  /* KTR */
  318 
  319 static int
  320 vmx_allow_x2apic_msrs(struct vmx *vmx)
  321 {
  322         int i, error;
  323 
  324         error = 0;
  325 
  326         /*
  327          * Allow readonly access to the following x2APIC MSRs from the guest.
  328          */
  329         error += guest_msr_ro(vmx, MSR_APIC_ID);
  330         error += guest_msr_ro(vmx, MSR_APIC_VERSION);
  331         error += guest_msr_ro(vmx, MSR_APIC_LDR);
  332         error += guest_msr_ro(vmx, MSR_APIC_SVR);
  333 
  334         for (i = 0; i < 8; i++)
  335                 error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
  336 
  337         for (i = 0; i < 8; i++)
  338                 error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
  339         
  340         for (i = 0; i < 8; i++)
  341                 error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
  342 
  343         error += guest_msr_ro(vmx, MSR_APIC_ESR);
  344         error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
  345         error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
  346         error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
  347         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
  348         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
  349         error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
  350         error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
  351         error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
  352         error += guest_msr_ro(vmx, MSR_APIC_ICR);
  353 
  354         /*
  355          * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
  356          *
  357          * These registers get special treatment described in the section
  358          * "Virtualizing MSR-Based APIC Accesses".
  359          */
  360         error += guest_msr_rw(vmx, MSR_APIC_TPR);
  361         error += guest_msr_rw(vmx, MSR_APIC_EOI);
  362         error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
  363 
  364         return (error);
  365 }
  366 
  367 u_long
  368 vmx_fix_cr0(u_long cr0)
  369 {
  370 
  371         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
  372 }
  373 
  374 u_long
  375 vmx_fix_cr4(u_long cr4)
  376 {
  377 
  378         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
  379 }
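
       /*
        * vmx_fix_cr0() and vmx_fix_cr4() force the bits that must be fixed
        * to 1 while in VMX operation (the *_ones_mask) and clear the bits
        * that must be fixed to 0 (the *_zeros_mask).  The masks are derived
        * from MSR_VMX_CR{0,4}_FIXED{0,1} in vmx_init() below.
        */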
  380 
  381 static void
  382 vpid_free(int vpid)
  383 {
  384         if (vpid < 0 || vpid > 0xffff)
  385                 panic("vpid_free: invalid vpid %d", vpid);
  386 
  387         /*
  388          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
  389          * the unit number allocator.
  390          */
  391 
  392         if (vpid > VM_MAXCPU)
  393                 free_unr(vpid_unr, vpid);
  394 }
  395 
  396 static void
  397 vpid_alloc(uint16_t *vpid, int num)
  398 {
  399         int i, x;
  400 
  401         if (num <= 0 || num > VM_MAXCPU)
  402                 panic("invalid number of vpids requested: %d", num);
  403 
  404         /*
  405          * If the "enable vpid" execution control is not enabled then the
  406          * VPID is required to be 0 for all vcpus.
  407          */
  408         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
  409                 for (i = 0; i < num; i++)
  410                         vpid[i] = 0;
  411                 return;
  412         }
  413 
  414         /*
  415          * Allocate a unique VPID for each vcpu from the unit number allocator.
  416          */
  417         for (i = 0; i < num; i++) {
  418                 x = alloc_unr(vpid_unr);
  419                 if (x == -1)
  420                         break;
  421                 else
  422                         vpid[i] = x;
  423         }
  424 
  425         if (i < num) {
  426                 atomic_add_int(&vpid_alloc_failed, 1);
  427 
  428                 /*
  429                  * If the unit number allocator does not have enough unique
  430                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
  431                  *
   432          * These VPIDs are not unique across VMs but this does not
  433                  * affect correctness because the combined mappings are also
  434                  * tagged with the EP4TA which is unique for each VM.
  435                  *
  436                  * It is still sub-optimal because the invvpid will invalidate
  437                  * combined mappings for a particular VPID across all EP4TAs.
  438                  */
  439                 while (i-- > 0)
  440                         vpid_free(vpid[i]);
  441 
  442                 for (i = 0; i < num; i++)
  443                         vpid[i] = i + 1;
  444         }
  445 }
  446 
  447 static void
  448 vpid_init(void)
  449 {
  450         /*
  451          * VPID 0 is required when the "enable VPID" execution control is
  452          * disabled.
  453          *
  454          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
  455          * unit number allocator does not have sufficient unique VPIDs to
  456          * satisfy the allocation.
  457          *
  458          * The remaining VPIDs are managed by the unit number allocator.
  459          */
  460         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
  461 }
  462 
  463 static void
  464 vmx_disable(void *arg __unused)
  465 {
  466         struct invvpid_desc invvpid_desc = { 0 };
  467         struct invept_desc invept_desc = { 0 };
  468 
  469         if (vmxon_enabled[curcpu]) {
  470                 /*
  471                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
  472                  *
   473                  * Neither VMXON nor VMXOFF is required to invalidate any TLB
   474                  * caching structures, so do so explicitly here to prevent
   475                  * retention of stale TLB entries between distinct VMX episodes.
  476                  */
  477                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
  478                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
  479                 vmxoff();
  480         }
  481         load_cr4(rcr4() & ~CR4_VMXE);
  482 }
  483 
  484 static int
  485 vmx_cleanup(void)
  486 {
  487         
  488         if (pirvec != 0)
  489                 vmm_ipi_free(pirvec);
  490 
  491         if (vpid_unr != NULL) {
  492                 delete_unrhdr(vpid_unr);
  493                 vpid_unr = NULL;
  494         }
  495 
  496         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
  497 
  498         return (0);
  499 }
  500 
  501 static void
  502 vmx_enable(void *arg __unused)
  503 {
  504         int error;
  505         uint64_t feature_control;
  506 
  507         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
  508         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
  509             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
  510                 wrmsr(MSR_IA32_FEATURE_CONTROL,
  511                     feature_control | IA32_FEATURE_CONTROL_VMX_EN |
  512                     IA32_FEATURE_CONTROL_LOCK);
  513         }
  514 
  515         load_cr4(rcr4() | CR4_VMXE);
  516 
  517         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
  518         error = vmxon(vmxon_region[curcpu]);
  519         if (error == 0)
  520                 vmxon_enabled[curcpu] = 1;
  521 }
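
       /*
        * Note that the first 32 bits of the VMXON region must contain the
        * VMCS revision identifier reported by MSR_VMX_BASIC, which is what
        * the vmx_revision() store above supplies.  The region itself is
        * page-sized and page-aligned (see vmxon_region[] above).
        */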
  522 
  523 static void
  524 vmx_restore(void)
  525 {
  526 
  527         if (vmxon_enabled[curcpu])
  528                 vmxon(vmxon_region[curcpu]);
  529 }
  530 
  531 static int
  532 vmx_init(int ipinum)
  533 {
  534         int error, use_tpr_shadow;
  535         uint64_t basic, fixed0, fixed1, feature_control;
  536         uint32_t tmp, procbased2_vid_bits;
  537 
  538         /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
  539         if (!(cpu_feature2 & CPUID2_VMX)) {
  540                 printf("vmx_init: processor does not support VMX operation\n");
  541                 return (ENXIO);
  542         }
  543 
  544         /*
  545          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
  546          * are set (bits 0 and 2 respectively).
  547          */
  548         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
  549         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
  550             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
  551                 printf("vmx_init: VMX operation disabled by BIOS\n");
  552                 return (ENXIO);
  553         }
  554 
  555         /*
  556          * Verify capabilities MSR_VMX_BASIC:
  557          * - bit 54 indicates support for INS/OUTS decoding
  558          */
  559         basic = rdmsr(MSR_VMX_BASIC);
  560         if ((basic & (1UL << 54)) == 0) {
  561                 printf("vmx_init: processor does not support desired basic "
  562                     "capabilities\n");
  563                 return (EINVAL);
  564         }
  565 
  566         /* Check support for primary processor-based VM-execution controls */
  567         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  568                                MSR_VMX_TRUE_PROCBASED_CTLS,
  569                                PROCBASED_CTLS_ONE_SETTING,
  570                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
  571         if (error) {
  572                 printf("vmx_init: processor does not support desired primary "
  573                        "processor-based controls\n");
  574                 return (error);
  575         }
  576 
  577         /* Clear the processor-based ctl bits that are set on demand */
  578         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
  579 
  580         /* Check support for secondary processor-based VM-execution controls */
  581         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
  582                                MSR_VMX_PROCBASED_CTLS2,
  583                                PROCBASED_CTLS2_ONE_SETTING,
  584                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
  585         if (error) {
  586                 printf("vmx_init: processor does not support desired secondary "
  587                        "processor-based controls\n");
  588                 return (error);
  589         }
  590 
  591         /* Check support for VPID */
  592         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
  593                                PROCBASED2_ENABLE_VPID, 0, &tmp);
  594         if (error == 0)
  595                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
  596 
  597         /* Check support for pin-based VM-execution controls */
  598         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
  599                                MSR_VMX_TRUE_PINBASED_CTLS,
  600                                PINBASED_CTLS_ONE_SETTING,
  601                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
  602         if (error) {
  603                 printf("vmx_init: processor does not support desired "
  604                        "pin-based controls\n");
  605                 return (error);
  606         }
  607 
  608         /* Check support for VM-exit controls */
  609         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
  610                                VM_EXIT_CTLS_ONE_SETTING,
  611                                VM_EXIT_CTLS_ZERO_SETTING,
  612                                &exit_ctls);
  613         if (error) {
  614                 printf("vmx_init: processor does not support desired "
  615                     "exit controls\n");
  616                 return (error);
  617         }
  618 
  619         /* Check support for VM-entry controls */
  620         error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
  621             VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
  622             &entry_ctls);
  623         if (error) {
  624                 printf("vmx_init: processor does not support desired "
  625                     "entry controls\n");
  626                 return (error);
  627         }
  628 
  629         /*
  630          * Check support for optional features by testing them
  631          * as individual bits
  632          */
  633         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  634                                         MSR_VMX_TRUE_PROCBASED_CTLS,
  635                                         PROCBASED_HLT_EXITING, 0,
  636                                         &tmp) == 0);
  637 
  638         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  639                                         MSR_VMX_PROCBASED_CTLS,
  640                                         PROCBASED_MTF, 0,
  641                                         &tmp) == 0);
  642 
  643         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  644                                          MSR_VMX_TRUE_PROCBASED_CTLS,
  645                                          PROCBASED_PAUSE_EXITING, 0,
  646                                          &tmp) == 0);
  647 
  648         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
  649                                         MSR_VMX_PROCBASED_CTLS2,
  650                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
  651                                         &tmp) == 0);
  652 
  653         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
  654             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
  655             &tmp) == 0);
  656 
  657         /*
  658          * Check support for virtual interrupt delivery.
  659          */
  660         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
  661             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
  662             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
  663             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
  664 
  665         use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  666             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
  667             &tmp) == 0);
  668 
  669         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
  670             procbased2_vid_bits, 0, &tmp);
  671         if (error == 0 && use_tpr_shadow) {
  672                 virtual_interrupt_delivery = 1;
  673                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
  674                     &virtual_interrupt_delivery);
  675         }
  676 
  677         if (virtual_interrupt_delivery) {
  678                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
  679                 procbased_ctls2 |= procbased2_vid_bits;
  680                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
  681 
  682                 /*
  683                  * No need to emulate accesses to %CR8 if virtual
  684                  * interrupt delivery is enabled.
  685                  */
  686                 procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
  687                 procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
  688 
  689                 /*
  690                  * Check for Posted Interrupts only if Virtual Interrupt
  691                  * Delivery is enabled.
  692                  */
  693                 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
  694                     MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
  695                     &tmp);
  696                 if (error == 0) {
  697                         pirvec = vmm_ipi_alloc();
  698                         if (pirvec == 0) {
  699                                 if (bootverbose) {
  700                                         printf("vmx_init: unable to allocate "
  701                                             "posted interrupt vector\n");
  702                                 }
  703                         } else {
  704                                 posted_interrupts = 1;
  705                                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
  706                                     &posted_interrupts);
  707                         }
  708                 }
  709         }
  710 
  711         if (posted_interrupts)
   712                 pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
  713 
  714         /* Initialize EPT */
  715         error = ept_init(ipinum);
  716         if (error) {
  717                 printf("vmx_init: ept initialization failed (%d)\n", error);
  718                 return (error);
  719         }
  720 
  721         /*
  722          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
  723          */
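               /*
                * A CR0 bit set in both FIXED0 and FIXED1 must be 1, a bit
                * clear in both must be 0, and a bit that is clear in FIXED0
                * but set in FIXED1 may be either; the same applies to the
                * CR4 masks computed below.
                */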
  724         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
  725         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
  726         cr0_ones_mask = fixed0 & fixed1;
  727         cr0_zeros_mask = ~fixed0 & ~fixed1;
  728 
  729         /*
  730          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
  731          * if unrestricted guest execution is allowed.
  732          */
  733         if (cap_unrestricted_guest)
  734                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
  735 
  736         /*
  737          * Do not allow the guest to set CR0_NW or CR0_CD.
  738          */
  739         cr0_zeros_mask |= (CR0_NW | CR0_CD);
  740 
  741         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
  742         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
  743         cr4_ones_mask = fixed0 & fixed1;
  744         cr4_zeros_mask = ~fixed0 & ~fixed1;
  745 
  746         vpid_init();
  747 
  748         vmx_msr_init();
  749 
  750         /* enable VMX operation */
  751         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
  752 
  753         vmx_initialized = 1;
  754 
  755         return (0);
  756 }
  757 
  758 static void
  759 vmx_trigger_hostintr(int vector)
  760 {
  761         uintptr_t func;
  762         struct gate_descriptor *gd;
  763 
  764         gd = &idt[vector];
  765 
  766         KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
  767             "invalid vector %d", vector));
  768         KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
  769             vector));
  770         KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
  771             "has invalid type %d", vector, gd->gd_type));
  772         KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
  773             "has invalid dpl %d", vector, gd->gd_dpl));
  774         KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
  775             "for vector %d has invalid selector %d", vector, gd->gd_selector));
  776         KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
  777             "IST %d", vector, gd->gd_ist));
  778 
  779         func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
  780         vmx_call_isr(func);
  781 }
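
       /*
        * The handler address above is reassembled from the split offset
        * fields of the 64-bit interrupt gate descriptor; vmx_call_isr()
        * then transfers control to it so the host services the interrupt
        * as if it had been taken directly.
        */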
  782 
  783 static int
  784 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
  785 {
  786         int error, mask_ident, shadow_ident;
  787         uint64_t mask_value;
  788 
  789         if (which != 0 && which != 4)
  790                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
  791 
  792         if (which == 0) {
  793                 mask_ident = VMCS_CR0_MASK;
  794                 mask_value = cr0_ones_mask | cr0_zeros_mask;
  795                 shadow_ident = VMCS_CR0_SHADOW;
  796         } else {
  797                 mask_ident = VMCS_CR4_MASK;
  798                 mask_value = cr4_ones_mask | cr4_zeros_mask;
  799                 shadow_ident = VMCS_CR4_SHADOW;
  800         }
  801 
  802         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
  803         if (error)
  804                 return (error);
  805 
  806         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
  807         if (error)
  808                 return (error);
  809 
  810         return (0);
  811 }
  812 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
  813 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
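
       /*
        * With the guest/host mask programmed above, guest reads of a masked
        * CR0/CR4 bit return the value from the read shadow, and a guest
        * write that would make a masked bit differ from its shadow causes a
        * VM-exit (EXIT_REASON_CR_ACCESS).  Unmasked bits behave normally.
        */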
  814 
  815 static void *
  816 vmx_vminit(struct vm *vm, pmap_t pmap)
  817 {
  818         uint16_t vpid[VM_MAXCPU];
  819         int i, error;
  820         struct vmx *vmx;
  821         struct vmcs *vmcs;
  822         uint32_t exc_bitmap;
  823 
  824         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
  825         if ((uintptr_t)vmx & PAGE_MASK) {
  826                 panic("malloc of struct vmx not aligned on %d byte boundary",
  827                       PAGE_SIZE);
  828         }
  829         vmx->vm = vm;
  830 
  831         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
  832 
  833         /*
  834          * Clean up EPTP-tagged guest physical and combined mappings
  835          *
  836          * VMX transitions are not required to invalidate any guest physical
  837          * mappings. So, it may be possible for stale guest physical mappings
  838          * to be present in the processor TLBs.
  839          *
  840          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
  841          */
  842         ept_invalidate_mappings(vmx->eptp);
  843 
  844         msr_bitmap_initialize(vmx->msr_bitmap);
  845 
  846         /*
  847          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
  848          * The guest FSBASE and GSBASE are saved and restored during
  849          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
  850          * always restored from the vmcs host state area on vm-exit.
  851          *
  852          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
  853          * how they are saved/restored so can be directly accessed by the
  854          * guest.
  855          *
  856          * MSR_EFER is saved and restored in the guest VMCS area on a
  857          * VM exit and entry respectively. It is also restored from the
  858          * host VMCS area on a VM exit.
  859          *
   860          * The TSC MSR is exposed read-only. Writes are disallowed as
   861          * they would impact the host TSC.  If the guest does a write,
   862          * the "use TSC offsetting" execution control is enabled and the
   863          * difference between the host TSC and the guest TSC is written
   864          * into the TSC offset in the VMCS.
  865          */
  866         if (guest_msr_rw(vmx, MSR_GSBASE) ||
  867             guest_msr_rw(vmx, MSR_FSBASE) ||
  868             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
  869             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
  870             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
  871             guest_msr_rw(vmx, MSR_EFER) ||
  872             guest_msr_ro(vmx, MSR_TSC))
  873                 panic("vmx_vminit: error setting guest msr access");
  874 
  875         vpid_alloc(vpid, VM_MAXCPU);
  876 
  877         if (virtual_interrupt_delivery) {
  878                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
  879                     APIC_ACCESS_ADDRESS);
  880                 /* XXX this should really return an error to the caller */
  881                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
  882         }
  883 
  884         for (i = 0; i < VM_MAXCPU; i++) {
  885                 vmcs = &vmx->vmcs[i];
  886                 vmcs->identifier = vmx_revision();
  887                 error = vmclear(vmcs);
  888                 if (error != 0) {
  889                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
  890                               error, i);
  891                 }
  892 
  893                 vmx_msr_guest_init(vmx, i);
  894 
  895                 error = vmcs_init(vmcs);
  896                 KASSERT(error == 0, ("vmcs_init error %d", error));
  897 
  898                 VMPTRLD(vmcs);
  899                 error = 0;
  900                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
  901                 error += vmwrite(VMCS_EPTP, vmx->eptp);
  902                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
  903                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
  904                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
  905                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
  906                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
  907                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
  908                 error += vmwrite(VMCS_VPID, vpid[i]);
  909 
  910                 /* exception bitmap */
  911                 if (vcpu_trace_exceptions(vm, i))
  912                         exc_bitmap = 0xffffffff;
  913                 else
  914                         exc_bitmap = 1 << IDT_MC;
  915                 error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);
  916 
  917                 if (virtual_interrupt_delivery) {
  918                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
  919                         error += vmwrite(VMCS_VIRTUAL_APIC,
  920                             vtophys(&vmx->apic_page[i]));
  921                         error += vmwrite(VMCS_EOI_EXIT0, 0);
  922                         error += vmwrite(VMCS_EOI_EXIT1, 0);
  923                         error += vmwrite(VMCS_EOI_EXIT2, 0);
  924                         error += vmwrite(VMCS_EOI_EXIT3, 0);
  925                 }
  926                 if (posted_interrupts) {
  927                         error += vmwrite(VMCS_PIR_VECTOR, pirvec);
  928                         error += vmwrite(VMCS_PIR_DESC,
  929                             vtophys(&vmx->pir_desc[i]));
  930                 }
  931                 VMCLEAR(vmcs);
  932                 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
  933 
  934                 vmx->cap[i].set = 0;
  935                 vmx->cap[i].proc_ctls = procbased_ctls;
  936                 vmx->cap[i].proc_ctls2 = procbased_ctls2;
  937 
  938                 vmx->state[i].nextrip = ~0;
  939                 vmx->state[i].lastcpu = NOCPU;
  940                 vmx->state[i].vpid = vpid[i];
  941 
  942                 /*
  943                  * Set up the CR0/4 shadows, and init the read shadow
  944                  * to the power-on register value from the Intel Sys Arch.
  945                  *  CR0 - 0x60000010
  946                  *  CR4 - 0
  947                  */
  948                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
  949                 if (error != 0)
  950                         panic("vmx_setup_cr0_shadow %d", error);
  951 
  952                 error = vmx_setup_cr4_shadow(vmcs, 0);
  953                 if (error != 0)
  954                         panic("vmx_setup_cr4_shadow %d", error);
  955 
  956                 vmx->ctx[i].pmap = pmap;
  957         }
  958 
  959         return (vmx);
  960 }
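
       /*
        * At this point every vcpu has a VMCS populated with the control
        * settings computed in vmx_init(), the VM's MSR bitmap, its VPID,
        * exception bitmap and CR0/CR4 shadows.  Each VMCS is left in the
        * "clear" state until the vcpu runs for the first time.
        */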
  961 
  962 static int
  963 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
  964 {
  965         int handled, func;
  966         
  967         func = vmxctx->guest_rax;
  968 
  969         handled = x86_emulate_cpuid(vm, vcpu,
  970                                     (uint32_t*)(&vmxctx->guest_rax),
  971                                     (uint32_t*)(&vmxctx->guest_rbx),
  972                                     (uint32_t*)(&vmxctx->guest_rcx),
  973                                     (uint32_t*)(&vmxctx->guest_rdx));
  974         return (handled);
  975 }
  976 
  977 static __inline void
  978 vmx_run_trace(struct vmx *vmx, int vcpu)
  979 {
  980 #ifdef KTR
  981         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
  982 #endif
  983 }
  984 
  985 static __inline void
  986 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
  987                int handled)
  988 {
  989 #ifdef KTR
  990         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
  991                  handled ? "handled" : "unhandled",
  992                  exit_reason_to_str(exit_reason), rip);
  993 #endif
  994 }
  995 
  996 static __inline void
  997 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
  998 {
  999 #ifdef KTR
 1000         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
 1001 #endif
 1002 }
 1003 
 1004 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
 1005 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
 1006 
 1007 /*
 1008  * Invalidate guest mappings identified by its vpid from the TLB.
 1009  */
 1010 static __inline void
 1011 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
 1012 {
 1013         struct vmxstate *vmxstate;
 1014         struct invvpid_desc invvpid_desc;
 1015 
 1016         vmxstate = &vmx->state[vcpu];
 1017         if (vmxstate->vpid == 0)
 1018                 return;
 1019 
 1020         if (!running) {
 1021                 /*
 1022                  * Set the 'lastcpu' to an invalid host cpu.
 1023                  *
 1024                  * This will invalidate TLB entries tagged with the vcpu's
 1025                  * vpid the next time it runs via vmx_set_pcpu_defaults().
 1026                  */
 1027                 vmxstate->lastcpu = NOCPU;
 1028                 return;
 1029         }
 1030 
 1031         KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
 1032             "critical section", __func__, vcpu));
 1033 
 1034         /*
 1035          * Invalidate all mappings tagged with 'vpid'
 1036          *
 1037          * We do this because this vcpu was executing on a different host
 1038          * cpu when it last ran. We do not track whether it invalidated
 1039          * mappings associated with its 'vpid' during that run. So we must
 1040          * assume that the mappings associated with 'vpid' on 'curcpu' are
 1041          * stale and invalidate them.
 1042          *
 1043          * Note that we incur this penalty only when the scheduler chooses to
 1044          * move the thread associated with this vcpu between host cpus.
 1045          *
 1046          * Note also that this will invalidate mappings tagged with 'vpid'
 1047          * for "all" EP4TAs.
 1048          */
 1049         if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
 1050                 invvpid_desc._res1 = 0;
 1051                 invvpid_desc._res2 = 0;
 1052                 invvpid_desc.vpid = vmxstate->vpid;
 1053                 invvpid_desc.linear_addr = 0;
 1054                 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
 1055                 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
 1056         } else {
 1057                 /*
 1058                  * The invvpid can be skipped if an invept is going to
 1059                  * be performed before entering the guest. The invept
 1060                  * will invalidate combined mappings tagged with
 1061                  * 'vmx->eptp' for all vpids.
 1062                  */
 1063                 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
 1064         }
 1065 }
 1066 
 1067 static void
 1068 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
 1069 {
 1070         struct vmxstate *vmxstate;
 1071 
 1072         vmxstate = &vmx->state[vcpu];
 1073         if (vmxstate->lastcpu == curcpu)
 1074                 return;
 1075 
 1076         vmxstate->lastcpu = curcpu;
 1077 
 1078         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
 1079 
 1080         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
 1081         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
 1082         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
 1083         vmx_invvpid(vmx, vcpu, pmap, 1);
 1084 }
 1085 
 1086 /*
 1087  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 1088  */
 1089 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
 1090 
 1091 static void __inline
 1092 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
 1093 {
 1094 
 1095         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
 1096                 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
 1097                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1098                 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
 1099         }
 1100 }
 1101 
 1102 static void __inline
 1103 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
 1104 {
 1105 
 1106         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
 1107             ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
 1108         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
 1109         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1110         VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
 1111 }
 1112 
 1113 static void __inline
 1114 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
 1115 {
 1116 
 1117         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
 1118                 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
 1119                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1120                 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
 1121         }
 1122 }
 1123 
 1124 static void __inline
 1125 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
 1126 {
 1127 
 1128         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
 1129             ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
 1130         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
 1131         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1132         VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
 1133 }
 1134 
 1135 int
 1136 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
 1137 {
 1138         int error;
 1139 
 1140         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
 1141                 vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
 1142                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1143                 VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
 1144         }
 1145 
 1146         error = vmwrite(VMCS_TSC_OFFSET, offset);
 1147 
 1148         return (error);
 1149 }
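
       /*
        * With "use TSC offsetting" enabled, guest executions of RDTSC and
        * RDTSCP return the host TSC plus the (signed) offset written above,
        * so the offset is simply the desired guest TSC minus the host TSC.
        */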
 1150 
 1151 #define NMI_BLOCKING    (VMCS_INTERRUPTIBILITY_NMI_BLOCKING |           \
 1152                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
 1153 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING |           \
 1154                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
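
       /*
        * NMI_BLOCKING and HWINTR_BLOCKING collect the guest
        * interruptibility-state bits that block injection of an NMI or of a
        * maskable hardware interrupt, respectively.
        */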
 1155 
 1156 static void
 1157 vmx_inject_nmi(struct vmx *vmx, int vcpu)
 1158 {
 1159         uint32_t gi, info;
 1160 
 1161         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1162         KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
 1163             "interruptibility-state %#x", gi));
 1164 
 1165         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 1166         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
 1167             "VM-entry interruption information %#x", info));
 1168 
 1169         /*
 1170          * Inject the virtual NMI. The vector must be the NMI IDT entry
 1171          * or the VMCS entry check will fail.
 1172          */
 1173         info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
 1174         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 1175 
 1176         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
 1177 
 1178         /* Clear the request */
 1179         vm_nmi_clear(vmx->vm, vcpu);
 1180 }
 1181 
 1182 static void
 1183 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
 1184     uint64_t guestrip)
 1185 {
 1186         int vector, need_nmi_exiting, extint_pending;
 1187         uint64_t rflags, entryinfo;
 1188         uint32_t gi, info;
 1189 
 1190         if (vmx->state[vcpu].nextrip != guestrip) {
 1191                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1192                 if (gi & HWINTR_BLOCKING) {
 1193                         VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
 1194                             "cleared due to rip change: %#lx/%#lx",
 1195                             vmx->state[vcpu].nextrip, guestrip);
 1196                         gi &= ~HWINTR_BLOCKING;
 1197                         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 1198                 }
 1199         }
 1200 
 1201         if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
 1202                 KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
 1203                     "intinfo is not valid: %#lx", __func__, entryinfo));
 1204 
 1205                 info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 1206                 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
 1207                      "pending exception: %#lx/%#x", __func__, entryinfo, info));
 1208 
 1209                 info = entryinfo;
 1210                 vector = info & 0xff;
 1211                 if (vector == IDT_BP || vector == IDT_OF) {
 1212                         /*
 1213                          * VT-x requires #BP and #OF to be injected as software
 1214                          * exceptions.
 1215                          */
 1216                         info &= ~VMCS_INTR_T_MASK;
 1217                         info |= VMCS_INTR_T_SWEXCEPTION;
 1218                 }
 1219 
 1220                 if (info & VMCS_INTR_DEL_ERRCODE)
 1221                         vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
 1222 
 1223                 vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 1224         }
 1225 
 1226         if (vm_nmi_pending(vmx->vm, vcpu)) {
 1227                 /*
 1228                  * If there are no conditions blocking NMI injection then
 1229                  * inject it directly here otherwise enable "NMI window
 1230                  * exiting" to inject it as soon as we can.
 1231                  *
 1232                  * We also check for STI_BLOCKING because some implementations
 1233                  * don't allow NMI injection in this case. If we are running
 1234                  * on a processor that doesn't have this restriction it will
 1235                  * immediately exit and the NMI will be injected in the
 1236                  * "NMI window exiting" handler.
 1237                  */
 1238                 need_nmi_exiting = 1;
 1239                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1240                 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
 1241                         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 1242                         if ((info & VMCS_INTR_VALID) == 0) {
 1243                                 vmx_inject_nmi(vmx, vcpu);
 1244                                 need_nmi_exiting = 0;
 1245                         } else {
 1246                                 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
 1247                                     "due to VM-entry intr info %#x", info);
 1248                         }
 1249                 } else {
 1250                         VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
 1251                             "Guest Interruptibility-state %#x", gi);
 1252                 }
 1253 
 1254                 if (need_nmi_exiting)
 1255                         vmx_set_nmi_window_exiting(vmx, vcpu);
 1256         }
 1257 
 1258         extint_pending = vm_extint_pending(vmx->vm, vcpu);
 1259 
 1260         if (!extint_pending && virtual_interrupt_delivery) {
 1261                 vmx_inject_pir(vlapic);
 1262                 return;
 1263         }
 1264 
 1265         /*
 1266          * If interrupt-window exiting is already in effect then don't bother
 1267          * checking for pending interrupts. This is just an optimization and
 1268          * not needed for correctness.
 1269          */
 1270         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
 1271                 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
 1272                     "pending int_window_exiting");
 1273                 return;
 1274         }
 1275 
 1276         if (!extint_pending) {
 1277                 /* Ask the local apic for a vector to inject */
 1278                 if (!vlapic_pending_intr(vlapic, &vector))
 1279                         return;
 1280 
 1281                 /*
 1282                  * From the Intel SDM, Volume 3, Section "Maskable
 1283                  * Hardware Interrupts":
 1284                  * - maskable interrupt vectors [16,255] can be delivered
 1285                  *   through the local APIC.
  1286                  */
 1287                 KASSERT(vector >= 16 && vector <= 255,
 1288                     ("invalid vector %d from local APIC", vector));
 1289         } else {
 1290                 /* Ask the legacy pic for a vector to inject */
 1291                 vatpic_pending_intr(vmx->vm, &vector);
 1292 
 1293                 /*
 1294                  * From the Intel SDM, Volume 3, Section "Maskable
 1295                  * Hardware Interrupts":
 1296                  * - maskable interrupt vectors [0,255] can be delivered
 1297                  *   through the INTR pin.
 1298                  */
 1299                 KASSERT(vector >= 0 && vector <= 255,
 1300                     ("invalid vector %d from INTR", vector));
 1301         }
 1302 
 1303         /* Check RFLAGS.IF and the interruptibility state of the guest */
 1304         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 1305         if ((rflags & PSL_I) == 0) {
 1306                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 1307                     "rflags %#lx", vector, rflags);
 1308                 goto cantinject;
 1309         }
 1310 
 1311         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1312         if (gi & HWINTR_BLOCKING) {
 1313                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 1314                     "Guest Interruptibility-state %#x", vector, gi);
 1315                 goto cantinject;
 1316         }
 1317 
 1318         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 1319         if (info & VMCS_INTR_VALID) {
 1320                 /*
 1321                  * This is expected and could happen for multiple reasons:
 1322                  * - A vectoring VM-entry was aborted due to astpending
 1323                  * - A VM-exit happened during event injection.
 1324                  * - An exception was injected above.
 1325                  * - An NMI was injected above or after "NMI window exiting"
 1326                  */
 1327                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 1328                     "VM-entry intr info %#x", vector, info);
 1329                 goto cantinject;
 1330         }
 1331 
 1332         /* Inject the interrupt */
 1333         info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
 1334         info |= vector;
 1335         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 1336 
 1337         if (!extint_pending) {
 1338                 /* Update the Local APIC ISR */
 1339                 vlapic_intr_accepted(vlapic, vector);
 1340         } else {
 1341                 vm_extint_clear(vmx->vm, vcpu);
 1342                 vatpic_intr_accepted(vmx->vm, vector);
 1343 
 1344                 /*
 1345                  * After we accepted the current ExtINT the PIC may
 1346                  * have posted another one.  If that is the case, set
 1347                  * the Interrupt Window Exiting execution control so
 1348                  * we can inject that one too.
 1349                  *
 1350                  * Also, interrupt window exiting allows us to inject any
 1351                  * pending APIC vector that was preempted by the ExtINT
 1352                  * as soon as possible. This applies both for the software
 1353                  * emulated vlapic and the hardware assisted virtual APIC.
 1354                  */
 1355                 vmx_set_int_window_exiting(vmx, vcpu);
 1356         }
 1357 
 1358         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
 1359 
 1360         return;
 1361 
 1362 cantinject:
 1363         /*
 1364          * Set the Interrupt Window Exiting execution control so we can inject
 1365          * the interrupt as soon as the blocking condition goes away.
 1366          */
 1367         vmx_set_int_window_exiting(vmx, vcpu);
 1368 }
 1369 
 1370 /*
 1371  * If the Virtual NMIs execution control is '1' then the logical processor
 1372  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
 1373  * the VMCS. An IRET instruction in VMX non-root operation will remove any
 1374  * virtual-NMI blocking.
 1375  *
 1376  * This unblocking occurs even if the IRET causes a fault. In this case the
 1377  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
 1378  */
 1379 static void
 1380 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
 1381 {
 1382         uint32_t gi;
 1383 
 1384         VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
 1385         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1386         gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
 1387         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 1388 }
 1389 
 1390 static void
 1391 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
 1392 {
 1393         uint32_t gi;
 1394 
 1395         VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
 1396         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1397         gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
 1398         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 1399 }
 1400 
 1401 static void
 1402 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
 1403 {
 1404         uint32_t gi;
 1405 
 1406         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1407         KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
 1408             ("NMI blocking is not in effect %#x", gi));
 1409 }
 1410 
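      /*
       * The guest invokes XSETBV with the XCR index in %ecx and the new
       * 64-bit value in %edx:%eax.  Only XCR0 is handled here; the checks
       * below mirror the #GP(0) conditions that hardware enforces for it.
       */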
 1411 static int
 1412 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 1413 {
 1414         struct vmxctx *vmxctx;
 1415         uint64_t xcrval;
 1416         const struct xsave_limits *limits;
 1417 
 1418         vmxctx = &vmx->ctx[vcpu];
 1419         limits = vmm_get_xsave_limits();
 1420 
 1421         /*
 1422          * Note that the processor raises a GP# fault on its own if
 1423          * xsetbv is executed for CPL != 0, so we do not have to
 1424          * emulate that fault here.
 1425          */
 1426 
 1427         /* Only xcr0 is supported. */
 1428         if (vmxctx->guest_rcx != 0) {
 1429                 vm_inject_gp(vmx->vm, vcpu);
 1430                 return (HANDLED);
 1431         }
 1432 
 1433         /* We only handle xcr0 if both the host and guest have XSAVE enabled. */
 1434         if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
 1435                 vm_inject_ud(vmx->vm, vcpu);
 1436                 return (HANDLED);
 1437         }
 1438 
 1439         xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
 1440         if ((xcrval & ~limits->xcr0_allowed) != 0) {
 1441                 vm_inject_gp(vmx->vm, vcpu);
 1442                 return (HANDLED);
 1443         }
 1444 
 1445         if (!(xcrval & XFEATURE_ENABLED_X87)) {
 1446                 vm_inject_gp(vmx->vm, vcpu);
 1447                 return (HANDLED);
 1448         }
 1449 
 1450         /* AVX (YMM_Hi128) requires SSE. */
 1451         if (xcrval & XFEATURE_ENABLED_AVX &&
 1452             (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
 1453                 vm_inject_gp(vmx->vm, vcpu);
 1454                 return (HANDLED);
 1455         }
 1456 
 1457         /*
 1458          * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
 1459          * ZMM_Hi256, and Hi16_ZMM.
 1460          */
 1461         if (xcrval & XFEATURE_AVX512 &&
 1462             (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
 1463             (XFEATURE_AVX512 | XFEATURE_AVX)) {
 1464                 vm_inject_gp(vmx->vm, vcpu);
 1465                 return (HANDLED);
 1466         }
 1467 
 1468         /*
 1469          * Intel MPX requires both bound register state flags to be
 1470          * set.
 1471          */
 1472         if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
 1473             ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
 1474                 vm_inject_gp(vmx->vm, vcpu);
 1475                 return (HANDLED);
 1476         }
 1477 
 1478         /*
 1479          * This runs "inside" vmrun() with the guest's FPU state, so
 1480          * modifying xcr0 directly modifies the guest's xcr0, not the
 1481          * host's.
 1482          */
 1483         load_xcr(0, xcrval);
 1484         return (HANDLED);
 1485 }
 1486 
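      /*
       * 'ident' is the general-purpose register encoding taken from the
       * VM-exit qualification: 0 = %rax, 1 = %rcx, ..., 15 = %r15.  %rsp
       * is not cached in the vmxctx, so it is accessed via the VMCS.
       */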
 1487 static uint64_t
 1488 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
 1489 {
 1490         const struct vmxctx *vmxctx;
 1491 
 1492         vmxctx = &vmx->ctx[vcpu];
 1493 
 1494         switch (ident) {
 1495         case 0:
 1496                 return (vmxctx->guest_rax);
 1497         case 1:
 1498                 return (vmxctx->guest_rcx);
 1499         case 2:
 1500                 return (vmxctx->guest_rdx);
 1501         case 3:
 1502                 return (vmxctx->guest_rbx);
 1503         case 4:
 1504                 return (vmcs_read(VMCS_GUEST_RSP));
 1505         case 5:
 1506                 return (vmxctx->guest_rbp);
 1507         case 6:
 1508                 return (vmxctx->guest_rsi);
 1509         case 7:
 1510                 return (vmxctx->guest_rdi);
 1511         case 8:
 1512                 return (vmxctx->guest_r8);
 1513         case 9:
 1514                 return (vmxctx->guest_r9);
 1515         case 10:
 1516                 return (vmxctx->guest_r10);
 1517         case 11:
 1518                 return (vmxctx->guest_r11);
 1519         case 12:
 1520                 return (vmxctx->guest_r12);
 1521         case 13:
 1522                 return (vmxctx->guest_r13);
 1523         case 14:
 1524                 return (vmxctx->guest_r14);
 1525         case 15:
 1526                 return (vmxctx->guest_r15);
 1527         default:
 1528                 panic("invalid vmx register %d", ident);
 1529         }
 1530 }
 1531 
 1532 static void
 1533 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
 1534 {
 1535         struct vmxctx *vmxctx;
 1536 
 1537         vmxctx = &vmx->ctx[vcpu];
 1538 
 1539         switch (ident) {
 1540         case 0:
 1541                 vmxctx->guest_rax = regval;
 1542                 break;
 1543         case 1:
 1544                 vmxctx->guest_rcx = regval;
 1545                 break;
 1546         case 2:
 1547                 vmxctx->guest_rdx = regval;
 1548                 break;
 1549         case 3:
 1550                 vmxctx->guest_rbx = regval;
 1551                 break;
 1552         case 4:
 1553                 vmcs_write(VMCS_GUEST_RSP, regval);
 1554                 break;
 1555         case 5:
 1556                 vmxctx->guest_rbp = regval;
 1557                 break;
 1558         case 6:
 1559                 vmxctx->guest_rsi = regval;
 1560                 break;
 1561         case 7:
 1562                 vmxctx->guest_rdi = regval;
 1563                 break;
 1564         case 8:
 1565                 vmxctx->guest_r8 = regval;
 1566                 break;
 1567         case 9:
 1568                 vmxctx->guest_r9 = regval;
 1569                 break;
 1570         case 10:
 1571                 vmxctx->guest_r10 = regval;
 1572                 break;
 1573         case 11:
 1574                 vmxctx->guest_r11 = regval;
 1575                 break;
 1576         case 12:
 1577                 vmxctx->guest_r12 = regval;
 1578                 break;
 1579         case 13:
 1580                 vmxctx->guest_r13 = regval;
 1581                 break;
 1582         case 14:
 1583                 vmxctx->guest_r14 = regval;
 1584                 break;
 1585         case 15:
 1586                 vmxctx->guest_r15 = regval;
 1587                 break;
 1588         default:
 1589                 panic("invalid vmx register %d", ident);
 1590         }
 1591 }
 1592 
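      /*
       * Per the Intel SDM, the exit qualification for a control-register
       * access encodes the register number in bits 3:0, the access type
       * in bits 5:4 (0 = MOV to CR) and the source register in bits 11:8.
       * The guest-visible value is kept in the read shadow while the real
       * CR0/CR4 is loaded with the value adjusted by the ones/zeros masks.
       */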
 1593 static int
 1594 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 1595 {
 1596         uint64_t crval, regval;
 1597 
 1598         /* We only handle mov to %cr0 at this time */
 1599         if ((exitqual & 0xf0) != 0x00)
 1600                 return (UNHANDLED);
 1601 
 1602         regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 1603 
 1604         vmcs_write(VMCS_CR0_SHADOW, regval);
 1605 
 1606         crval = regval | cr0_ones_mask;
 1607         crval &= ~cr0_zeros_mask;
 1608         vmcs_write(VMCS_GUEST_CR0, crval);
 1609 
 1610         if (regval & CR0_PG) {
 1611                 uint64_t efer, entry_ctls;
 1612 
 1613                 /*
 1614                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
 1615                  * the "IA-32e mode guest" bit in VM-entry control must be
 1616                  * equal.
 1617                  */
 1618                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
 1619                 if (efer & EFER_LME) {
 1620                         efer |= EFER_LMA;
 1621                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
 1622                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
 1623                         entry_ctls |= VM_ENTRY_GUEST_LMA;
 1624                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
 1625                 }
 1626         }
 1627 
 1628         return (HANDLED);
 1629 }
 1630 
 1631 static int
 1632 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 1633 {
 1634         uint64_t crval, regval;
 1635 
 1636         /* We only handle mov to %cr4 at this time */
 1637         if ((exitqual & 0xf0) != 0x00)
 1638                 return (UNHANDLED);
 1639 
 1640         regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 1641 
 1642         vmcs_write(VMCS_CR4_SHADOW, regval);
 1643 
 1644         crval = regval | cr4_ones_mask;
 1645         crval &= ~cr4_zeros_mask;
 1646         vmcs_write(VMCS_GUEST_CR4, crval);
 1647 
 1648         return (HANDLED);
 1649 }
 1650 
 1651 static int
 1652 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 1653 {
 1654         struct vlapic *vlapic;
 1655         uint64_t cr8;
 1656         int regnum;
 1657 
 1658         /* We only handle mov %cr8 to/from a register at this time. */
 1659         if ((exitqual & 0xe0) != 0x00) {
 1660                 return (UNHANDLED);
 1661         }
 1662 
 1663         vlapic = vm_lapic(vmx->vm, vcpu);
 1664         regnum = (exitqual >> 8) & 0xf;
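              /*
               * Bit 4 of the exit qualification distinguishes a read (MOV
               * from %cr8, sourced from the vlapic's task-priority state)
               * from a write (MOV to %cr8).
               */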
 1665         if (exitqual & 0x10) {
 1666                 cr8 = vlapic_get_cr8(vlapic);
 1667                 vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
 1668         } else {
 1669                 cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
 1670                 vlapic_set_cr8(vlapic, cr8);
 1671         }
 1672 
 1673         return (HANDLED);
 1674 }
 1675 
 1676 /*
 1677  * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
 1678  */
 1679 static int
 1680 vmx_cpl(void)
 1681 {
 1682         uint32_t ssar;
 1683 
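              /*
               * The DPL occupies bits 6:5 of the access-rights field,
               * mirroring the layout of the segment-descriptor access byte.
               */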
 1684         ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
 1685         return ((ssar >> 5) & 0x3);
 1686 }
 1687 
 1688 static enum vm_cpu_mode
 1689 vmx_cpu_mode(void)
 1690 {
 1691         uint32_t csar;
 1692 
 1693         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
 1694                 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 1695                 if (csar & 0x2000)
 1696                         return (CPU_MODE_64BIT);        /* CS.L = 1 */
 1697                 else
 1698                         return (CPU_MODE_COMPATIBILITY);
 1699         } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
 1700                 return (CPU_MODE_PROTECTED);
 1701         } else {
 1702                 return (CPU_MODE_REAL);
 1703         }
 1704 }
 1705 
 1706 static enum vm_paging_mode
 1707 vmx_paging_mode(void)
 1708 {
 1709 
 1710         if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
 1711                 return (PAGING_MODE_FLAT);
 1712         if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
 1713                 return (PAGING_MODE_32);
 1714         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
 1715                 return (PAGING_MODE_64);
 1716         else
 1717                 return (PAGING_MODE_PAE);
 1718 }
 1719 
 1720 static uint64_t
 1721 inout_str_index(struct vmx *vmx, int vcpuid, int in)
 1722 {
 1723         uint64_t val;
 1724         int error;
 1725         enum vm_reg_name reg;
 1726 
 1727         reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
 1728         error = vmx_getreg(vmx, vcpuid, reg, &val);
 1729         KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
 1730         return (val);
 1731 }
 1732 
 1733 static uint64_t
 1734 inout_str_count(struct vmx *vmx, int vcpuid, int rep)
 1735 {
 1736         uint64_t val;
 1737         int error;
 1738 
 1739         if (rep) {
 1740                 error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
 1741                 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
 1742         } else {
 1743                 val = 1;
 1744         }
 1745         return (val);
 1746 }
 1747 
 1748 static int
 1749 inout_str_addrsize(uint32_t inst_info)
 1750 {
 1751         uint32_t size;
 1752 
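              /*
               * Bits 9:7 of the VM-exit instruction information encode the
               * address size used by the INS/OUTS instruction.
               */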
 1753         size = (inst_info >> 7) & 0x7;
 1754         switch (size) {
 1755         case 0:
 1756                 return (2);     /* 16 bit */
 1757         case 1:
 1758                 return (4);     /* 32 bit */
 1759         case 2:
 1760                 return (8);     /* 64 bit */
 1761         default:
 1762                 panic("%s: invalid size encoding %d", __func__, size);
 1763         }
 1764 }
 1765 
 1766 static void
 1767 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
 1768     struct vm_inout_str *vis)
 1769 {
 1770         int error, s;
 1771 
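              /*
               * INS always stores to %es:(%rdi) and the segment cannot be
               * overridden; for OUTS the segment register is encoded in
               * bits 17:15 of the VM-exit instruction information.
               */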
 1772         if (in) {
 1773                 vis->seg_name = VM_REG_GUEST_ES;
 1774         } else {
 1775                 s = (inst_info >> 15) & 0x7;
 1776                 vis->seg_name = vm_segment_name(s);
 1777         }
 1778 
 1779         error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
 1780         KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
 1781 }
 1782 
 1783 static void
 1784 vmx_paging_info(struct vm_guest_paging *paging)
 1785 {
 1786         paging->cr3 = vmcs_guest_cr3();
 1787         paging->cpl = vmx_cpl();
 1788         paging->cpu_mode = vmx_cpu_mode();
 1789         paging->paging_mode = vmx_paging_mode();
 1790 }
 1791 
 1792 static void
 1793 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
 1794 {
 1795         struct vm_guest_paging *paging;
 1796         uint32_t csar;
 1797 
 1798         paging = &vmexit->u.inst_emul.paging;
 1799 
 1800         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 1801         vmexit->inst_length = 0;
 1802         vmexit->u.inst_emul.gpa = gpa;
 1803         vmexit->u.inst_emul.gla = gla;
 1804         vmx_paging_info(paging);
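              /*
               * Record the code-segment base and default-size bit for the
               * instruction emulator; real mode and 64-bit mode report
               * cs_d as 0.
               */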
 1805         switch (paging->cpu_mode) {
 1806         case CPU_MODE_REAL:
 1807                 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
 1808                 vmexit->u.inst_emul.cs_d = 0;
 1809                 break;
 1810         case CPU_MODE_PROTECTED:
 1811         case CPU_MODE_COMPATIBILITY:
 1812                 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
 1813                 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 1814                 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
 1815                 break;
 1816         default:
 1817                 vmexit->u.inst_emul.cs_base = 0;
 1818                 vmexit->u.inst_emul.cs_d = 0;
 1819                 break;
 1820         }
 1821         vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
 1822 }
 1823 
 1824 static int
 1825 ept_fault_type(uint64_t ept_qual)
 1826 {
 1827         int fault_type;
 1828 
 1829         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
 1830                 fault_type = VM_PROT_WRITE;
 1831         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
 1832                 fault_type = VM_PROT_EXECUTE;
 1833         else
 1834                 fault_type = VM_PROT_READ;
 1835 
 1836         return (fault_type);
 1837 }
 1838 
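      /*
       * Decide whether an EPT violation should be handled by emulating
       * the faulting instruction, i.e. whether it looks like an ordinary
       * MMIO access rather than an instruction fetch or a fault without a
       * valid guest-linear-address translation.
       */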
 1839 static boolean_t
 1840 ept_emulation_fault(uint64_t ept_qual)
 1841 {
 1842         int read, write;
 1843 
 1844         /* EPT fault on an instruction fetch doesn't make sense here */
 1845         if (ept_qual & EPT_VIOLATION_INST_FETCH)
 1846                 return (FALSE);
 1847 
 1848         /* EPT fault must be a read fault or a write fault */
 1849         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
 1850         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
 1851         if ((read | write) == 0)
 1852                 return (FALSE);
 1853 
 1854         /*
 1855          * The EPT violation must have been caused by accessing a
 1856          * guest-physical address that is a translation of a guest-linear
 1857          * address.
 1858          */
 1859         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
 1860             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
 1861                 return (FALSE);
 1862         }
 1863 
 1864         return (TRUE);
 1865 }
 1866 
 1867 static __inline int
 1868 apic_access_virtualization(struct vmx *vmx, int vcpuid)
 1869 {
 1870         uint32_t proc_ctls2;
 1871 
 1872         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
 1873         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
 1874 }
 1875 
 1876 static __inline int
 1877 x2apic_virtualization(struct vmx *vmx, int vcpuid)
 1878 {
 1879         uint32_t proc_ctls2;
 1880 
 1881         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
 1882         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
 1883 }
 1884 
 1885 static int
 1886 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
 1887     uint64_t qual)
 1888 {
 1889         int error, handled, offset;
 1890         uint32_t *apic_regs, vector;
 1891         bool retu;
 1892 
 1893         handled = HANDLED;
 1894         offset = APIC_WRITE_OFFSET(qual);
 1895 
 1896         if (!apic_access_virtualization(vmx, vcpuid)) {
 1897                 /*
 1898                  * In general there should not be any APIC write VM-exits
 1899                  * unless APIC-access virtualization is enabled.
 1900                  *
 1901                  * However, self-IPI virtualization can legitimately trigger
 1902                  * an APIC-write VM-exit so treat it specially.
 1903                  */
 1904                 if (x2apic_virtualization(vmx, vcpuid) &&
 1905                     offset == APIC_OFFSET_SELF_IPI) {
 1906                         apic_regs = (uint32_t *)(vlapic->apic_page);
 1907                         vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
 1908                         vlapic_self_ipi_handler(vlapic, vector);
 1909                         return (HANDLED);
 1910                 } else
 1911                         return (UNHANDLED);
 1912         }
 1913 
 1914         switch (offset) {
 1915         case APIC_OFFSET_ID:
 1916                 vlapic_id_write_handler(vlapic);
 1917                 break;
 1918         case APIC_OFFSET_LDR:
 1919                 vlapic_ldr_write_handler(vlapic);
 1920                 break;
 1921         case APIC_OFFSET_DFR:
 1922                 vlapic_dfr_write_handler(vlapic);
 1923                 break;
 1924         case APIC_OFFSET_SVR:
 1925                 vlapic_svr_write_handler(vlapic);
 1926                 break;
 1927         case APIC_OFFSET_ESR:
 1928                 vlapic_esr_write_handler(vlapic);
 1929                 break;
 1930         case APIC_OFFSET_ICR_LOW:
 1931                 retu = false;
 1932                 error = vlapic_icrlo_write_handler(vlapic, &retu);
 1933                 if (error != 0 || retu)
 1934                         handled = UNHANDLED;
 1935                 break;
 1936         case APIC_OFFSET_CMCI_LVT:
 1937         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
 1938                 vlapic_lvt_write_handler(vlapic, offset);
 1939                 break;
 1940         case APIC_OFFSET_TIMER_ICR:
 1941                 vlapic_icrtmr_write_handler(vlapic);
 1942                 break;
 1943         case APIC_OFFSET_TIMER_DCR:
 1944                 vlapic_dcr_write_handler(vlapic);
 1945                 break;
 1946         default:
 1947                 handled = UNHANDLED;
 1948                 break;
 1949         }
 1950         return (handled);
 1951 }
 1952 
 1953 static bool
 1954 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
 1955 {
 1956 
 1957         if (apic_access_virtualization(vmx, vcpuid) &&
 1958             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
 1959                 return (true);
 1960         else
 1961                 return (false);
 1962 }
 1963 
 1964 static int
 1965 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 1966 {
 1967         uint64_t qual;
 1968         int access_type, offset, allowed;
 1969 
 1970         if (!apic_access_virtualization(vmx, vcpuid))
 1971                 return (UNHANDLED);
 1972 
 1973         qual = vmexit->u.vmx.exit_qualification;
 1974         access_type = APIC_ACCESS_TYPE(qual);
 1975         offset = APIC_ACCESS_OFFSET(qual);
 1976 
 1977         allowed = 0;
 1978         if (access_type == 0) {
 1979                 /*
 1980                  * Read data access to the following registers is expected.
 1981                  */
 1982                 switch (offset) {
 1983                 case APIC_OFFSET_APR:
 1984                 case APIC_OFFSET_PPR:
 1985                 case APIC_OFFSET_RRR:
 1986                 case APIC_OFFSET_CMCI_LVT:
 1987                 case APIC_OFFSET_TIMER_CCR:
 1988                         allowed = 1;
 1989                         break;
 1990                 default:
 1991                         break;
 1992                 }
 1993         } else if (access_type == 1) {
 1994                 /*
 1995                  * Write data access to the following registers is expected.
 1996                  */
 1997                 switch (offset) {
 1998                 case APIC_OFFSET_VER:
 1999                 case APIC_OFFSET_APR:
 2000                 case APIC_OFFSET_PPR:
 2001                 case APIC_OFFSET_RRR:
 2002                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
 2003                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
 2004                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
 2005                 case APIC_OFFSET_CMCI_LVT:
 2006                 case APIC_OFFSET_TIMER_CCR:
 2007                         allowed = 1;
 2008                         break;
 2009                 default:
 2010                         break;
 2011                 }
 2012         }
 2013 
 2014         if (allowed) {
 2015                 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
 2016                     VIE_INVALID_GLA);
 2017         }
 2018 
 2019         /*
 2020          * Regardless of whether the APIC access is allowed, this handler
 2021          * always returns UNHANDLED:
 2022          * - if the access is allowed then it is handled by emulating the
 2023          *   instruction that caused the VM-exit (outside the critical section)
 2024          * - if the access is not allowed then it will be converted to an
 2025          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
 2026          */
 2027         return (UNHANDLED);
 2028 }
 2029 
 2030 static enum task_switch_reason
 2031 vmx_task_switch_reason(uint64_t qual)
 2032 {
 2033         int reason;
 2034 
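              /*
               * Bits 31:30 of the exit qualification encode the source of
               * the task switch: CALL, IRET, JMP or a task gate in the IDT.
               */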
 2035         reason = (qual >> 30) & 0x3;
 2036         switch (reason) {
 2037         case 0:
 2038                 return (TSR_CALL);
 2039         case 1:
 2040                 return (TSR_IRET);
 2041         case 2:
 2042                 return (TSR_JMP);
 2043         case 3:
 2044                 return (TSR_IDT_GATE);
 2045         default:
 2046                 panic("%s: invalid reason %d", __func__, reason);
 2047         }
 2048 }
 2049 
 2050 static int
 2051 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
 2052 {
 2053         int error;
 2054 
 2055         if (lapic_msr(num))
 2056                 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
 2057         else
 2058                 error = vmx_wrmsr(vmx, vcpuid, num, val, retu);
 2059 
 2060         return (error);
 2061 }
 2062 
 2063 static int
 2064 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
 2065 {
 2066         struct vmxctx *vmxctx;
 2067         uint64_t result;
 2068         uint32_t eax, edx;
 2069         int error;
 2070 
 2071         if (lapic_msr(num))
 2072                 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu);
 2073         else
 2074                 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu);
 2075 
 2076         if (error == 0) {
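                      /*
                       * RDMSR returns its result in %edx:%eax, so split the
                       * 64-bit value and write both halves back into the
                       * guest's register context.
                       */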
 2077                 eax = result;
 2078                 vmxctx = &vmx->ctx[vcpuid];
 2079                 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
 2080                 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
 2081 
 2082                 edx = result >> 32;
 2083                 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
 2084                 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
 2085         }
 2086 
 2087         return (error);
 2088 }
 2089 
 2090 static int
 2091 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 2092 {
 2093         int error, errcode, errcode_valid, handled, in;
 2094         struct vmxctx *vmxctx;
 2095         struct vlapic *vlapic;
 2096         struct vm_inout_str *vis;
 2097         struct vm_task_switch *ts;
 2098         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
 2099         uint32_t intr_type, intr_vec, reason;
 2100         uint64_t exitintinfo, qual, gpa;
 2101         bool retu;
 2102 
 2103         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
 2104         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
 2105 
 2106         handled = UNHANDLED;
 2107         vmxctx = &vmx->ctx[vcpu];
 2108 
 2109         qual = vmexit->u.vmx.exit_qualification;
 2110         reason = vmexit->u.vmx.exit_reason;
 2111         vmexit->exitcode = VM_EXITCODE_BOGUS;
 2112 
 2113         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
 2114 
 2115         /*
 2116          * VM-entry failures during or after loading guest state.
 2117          *
 2118          * These VM-exits are uncommon but must be handled specially
 2119          * as most VM-exit fields are not populated as usual.
 2120          */
 2121         if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
 2122                 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
 2123                 __asm __volatile("int $18");
 2124                 return (1);
 2125         }
 2126 
 2127         /*
 2128          * VM exits that can be triggered during event delivery need to
 2129          * be handled specially by re-injecting the event if the IDT
 2130          * vectoring information field's valid bit is set.
 2131          *
 2132          * See "Information for VM Exits During Event Delivery" in Intel SDM
 2133          * for details.
 2134          */
 2135         idtvec_info = vmcs_idt_vectoring_info();
 2136         if (idtvec_info & VMCS_IDT_VEC_VALID) {
 2137                 idtvec_info &= ~(1 << 12); /* clear undefined bit */
 2138                 exitintinfo = idtvec_info;
 2139                 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 2140                         idtvec_err = vmcs_idt_vectoring_err();
 2141                         exitintinfo |= (uint64_t)idtvec_err << 32;
 2142                 }
 2143                 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
 2144                 KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
 2145                     __func__, error));
 2146 
 2147                 /*
 2148                  * If 'virtual NMIs' are being used and the VM-exit
 2149                  * happened while injecting an NMI during the previous
 2150                  * VM-entry, then clear "blocking by NMI" in the
 2151                  * Guest Interruptibility-State so the NMI can be
 2152                  * reinjected on the subsequent VM-entry.
 2153                  *
 2154                  * However, if the NMI was being delivered through a task
 2155                  * gate, then the new task must start execution with NMIs
 2156                  * blocked so don't clear NMI blocking in this case.
 2157                  */
 2158                 intr_type = idtvec_info & VMCS_INTR_T_MASK;
 2159                 if (intr_type == VMCS_INTR_T_NMI) {
 2160                         if (reason != EXIT_REASON_TASK_SWITCH)
 2161                                 vmx_clear_nmi_blocking(vmx, vcpu);
 2162                         else
 2163                                 vmx_assert_nmi_blocking(vmx, vcpu);
 2164                 }
 2165 
 2166                 /*
 2167                  * Update VM-entry instruction length if the event being
 2168                  * delivered was a software interrupt or software exception.
 2169                  */
 2170                 if (intr_type == VMCS_INTR_T_SWINTR ||
 2171                     intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
 2172                     intr_type == VMCS_INTR_T_SWEXCEPTION) {
 2173                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
 2174                 }
 2175         }
 2176 
 2177         switch (reason) {
 2178         case EXIT_REASON_TASK_SWITCH:
 2179                 ts = &vmexit->u.task_switch;
 2180                 ts->tsssel = qual & 0xffff;
 2181                 ts->reason = vmx_task_switch_reason(qual);
 2182                 ts->ext = 0;
 2183                 ts->errcode_valid = 0;
 2184                 vmx_paging_info(&ts->paging);
 2185                 /*
 2186                  * If the task switch was due to a CALL, JMP, IRET, software
 2187                  * interrupt (INT n) or software exception (INT3, INTO),
 2188                  * then the saved %rip references the instruction that caused
 2189                  * the task switch. The instruction length field in the VMCS
 2190                  * is valid in this case.
 2191                  *
 2192                  * In all other cases (e.g., NMI, hardware exception) the
 2193                  * saved %rip is one that would have been saved in the old TSS
 2194                  * had the task switch completed normally so the instruction
 2195                  * length field is not needed in this case and is explicitly
 2196                  * set to 0.
 2197                  */
 2198                 if (ts->reason == TSR_IDT_GATE) {
 2199                         KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
 2200                             ("invalid idtvec_info %#x for IDT task switch",
 2201                             idtvec_info));
 2202                         intr_type = idtvec_info & VMCS_INTR_T_MASK;
 2203                         if (intr_type != VMCS_INTR_T_SWINTR &&
 2204                             intr_type != VMCS_INTR_T_SWEXCEPTION &&
 2205                             intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
 2206                                 /* Task switch triggered by external event */
 2207                                 ts->ext = 1;
 2208                                 vmexit->inst_length = 0;
 2209                                 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 2210                                         ts->errcode_valid = 1;
 2211                                         ts->errcode = vmcs_idt_vectoring_err();
 2212                                 }
 2213                         }
 2214                 }
 2215                 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
 2216                 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
 2217                     "%s errcode 0x%016lx", ts->reason, ts->tsssel,
 2218                     ts->ext ? "external" : "internal",
 2219                     ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
 2220                 break;
 2221         case EXIT_REASON_CR_ACCESS:
 2222                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
 2223                 switch (qual & 0xf) {
 2224                 case 0:
 2225                         handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
 2226                         break;
 2227                 case 4:
 2228                         handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
 2229                         break;
 2230                 case 8:
 2231                         handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
 2232                         break;
 2233                 }
 2234                 break;
 2235         case EXIT_REASON_RDMSR:
 2236                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
 2237                 retu = false;
 2238                 ecx = vmxctx->guest_rcx;
 2239                 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
 2240                 error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
 2241                 if (error) {
 2242                         vmexit->exitcode = VM_EXITCODE_RDMSR;
 2243                         vmexit->u.msr.code = ecx;
 2244                 } else if (!retu) {
 2245                         handled = HANDLED;
 2246                 } else {
 2247                         /* Return to userspace with a valid exitcode */
 2248                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 2249                             ("emulate_rdmsr retu with bogus exitcode"));
 2250                 }
 2251                 break;
 2252         case EXIT_REASON_WRMSR:
 2253                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
 2254                 retu = false;
 2255                 eax = vmxctx->guest_rax;
 2256                 ecx = vmxctx->guest_rcx;
 2257                 edx = vmxctx->guest_rdx;
 2258                 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
 2259                     ecx, (uint64_t)edx << 32 | eax);
 2260                 error = emulate_wrmsr(vmx, vcpu, ecx,
 2261                     (uint64_t)edx << 32 | eax, &retu);
 2262                 if (error) {
 2263                         vmexit->exitcode = VM_EXITCODE_WRMSR;
 2264                         vmexit->u.msr.code = ecx;
 2265                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
 2266                 } else if (!retu) {
 2267                         handled = HANDLED;
 2268                 } else {
 2269                         /* Return to userspace with a valid exitcode */
 2270                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 2271                             ("emulate_wrmsr retu with bogus exitcode"));
 2272                 }
 2273                 break;
 2274         case EXIT_REASON_HLT:
 2275                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
 2276                 vmexit->exitcode = VM_EXITCODE_HLT;
 2277                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 2278                 break;
 2279         case EXIT_REASON_MTF:
 2280                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
 2281                 vmexit->exitcode = VM_EXITCODE_MTRAP;
 2282                 vmexit->inst_length = 0;
 2283                 break;
 2284         case EXIT_REASON_PAUSE:
 2285                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
 2286                 vmexit->exitcode = VM_EXITCODE_PAUSE;
 2287                 break;
 2288         case EXIT_REASON_INTR_WINDOW:
 2289                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
 2290                 vmx_clear_int_window_exiting(vmx, vcpu);
 2291                 return (1);
 2292         case EXIT_REASON_EXT_INTR:
 2293                 /*
 2294                  * External interrupts serve only to cause VM exits and allow
 2295                  * the host interrupt handler to run.
 2296                  *
 2297                  * If this external interrupt triggers a virtual interrupt
 2298                  * to a VM, then that state will be recorded by the
 2299                  * host interrupt handler in the VM's softc. We will inject
 2300                  * this virtual interrupt during the subsequent VM enter.
 2301                  */
 2302                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 2303 
 2304                 /*
 2305                  * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
 2306                  * This appears to be a bug in VMware Fusion?
 2307                  */
 2308                 if (!(intr_info & VMCS_INTR_VALID))
 2309                         return (1);
 2310                 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
 2311                     (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
 2312                     ("VM exit interruption info invalid: %#x", intr_info));
 2313                 vmx_trigger_hostintr(intr_info & 0xff);
 2314 
 2315                 /*
 2316                  * This is special. We want to treat this as a 'handled'
 2317                  * VM-exit but not increment the instruction pointer.
 2318                  */
 2319                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
 2320                 return (1);
 2321         case EXIT_REASON_NMI_WINDOW:
 2322                 /* Exit to allow the pending virtual NMI to be injected */
 2323                 if (vm_nmi_pending(vmx->vm, vcpu))
 2324                         vmx_inject_nmi(vmx, vcpu);
 2325                 vmx_clear_nmi_window_exiting(vmx, vcpu);
 2326                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
 2327                 return (1);
 2328         case EXIT_REASON_INOUT:
 2329                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
 2330                 vmexit->exitcode = VM_EXITCODE_INOUT;
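                      /*
                       * I/O exit qualification: bits 2:0 hold the access
                       * size minus one, bit 3 the direction (1 = in), bit 4
                       * the string flag, bit 5 the REP prefix and bits
                       * 31:16 the port number.
                       */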
 2331                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
 2332                 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
 2333                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
 2334                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
 2335                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
 2336                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
 2337                 if (vmexit->u.inout.string) {
 2338                         inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
 2339                         vmexit->exitcode = VM_EXITCODE_INOUT_STR;
 2340                         vis = &vmexit->u.inout_str;
 2341                         vmx_paging_info(&vis->paging);
 2342                         vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 2343                         vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
 2344                         vis->index = inout_str_index(vmx, vcpu, in);
 2345                         vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
 2346                         vis->addrsize = inout_str_addrsize(inst_info);
 2347                         inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
 2348                 }
 2349                 break;
 2350         case EXIT_REASON_CPUID:
 2351                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
 2352                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
 2353                 break;
 2354         case EXIT_REASON_EXCEPTION:
 2355                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
 2356                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 2357                 KASSERT((intr_info & VMCS_INTR_VALID) != 0,
 2358                     ("VM exit interruption info invalid: %#x", intr_info));
 2359 
 2360                 intr_vec = intr_info & 0xff;
 2361                 intr_type = intr_info & VMCS_INTR_T_MASK;
 2362 
 2363                 /*
 2364                  * If Virtual NMIs control is 1 and the VM-exit is due to a
 2365                  * fault encountered during the execution of IRET then we must
 2366                  * restore the state of "virtual-NMI blocking" before resuming
 2367                  * the guest.
 2368                  *
 2369                  * See "Resuming Guest Software after Handling an Exception".
 2370                  * See "Information for VM Exits Due to Vectored Events".
 2371                  */
 2372                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 2373                     (intr_vec != IDT_DF) &&
 2374                     (intr_info & EXIT_QUAL_NMIUDTI) != 0)
 2375                         vmx_restore_nmi_blocking(vmx, vcpu);
 2376 
 2377                 /*
 2378                  * The NMI has already been handled in vmx_exit_handle_nmi().
 2379                  */
 2380                 if (intr_type == VMCS_INTR_T_NMI)
 2381                         return (1);
 2382 
 2383                 /*
 2384                  * Call the machine check handler by hand. Also don't reflect
 2385                  * the machine check back into the guest.
 2386                  */
 2387                 if (intr_vec == IDT_MC) {
 2388                         VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
 2389                         __asm __volatile("int $18");
 2390                         return (1);
 2391                 }
 2392 
 2393                 if (intr_vec == IDT_PF) {
 2394                         error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
 2395                         KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
 2396                             __func__, error));
 2397                 }
 2398 
 2399                 /*
 2400                  * Software exceptions exhibit trap-like behavior. This in
 2401                  * turn requires populating the VM-entry instruction length
 2402                  * so that the %rip in the trap frame is past the INT3/INTO
 2403                  * instruction.
 2404                  */
 2405                 if (intr_type == VMCS_INTR_T_SWEXCEPTION)
 2406                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
 2407 
 2408                 /* Reflect all other exceptions back into the guest */
 2409                 errcode_valid = errcode = 0;
 2410                 if (intr_info & VMCS_INTR_DEL_ERRCODE) {
 2411                         errcode_valid = 1;
 2412                         errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
 2413                 }
 2414                 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
 2415                     "the guest", intr_vec, errcode);
 2416                 error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
 2417                     errcode_valid, errcode, 0);
 2418                 KASSERT(error == 0, ("%s: vm_inject_exception error %d",
 2419                     __func__, error));
 2420                 return (1);
 2421 
 2422         case EXIT_REASON_EPT_FAULT:
 2423                 /*
 2424                  * If 'gpa' lies within the address space allocated to
 2425                  * memory then this must be a nested page fault; otherwise
 2426                  * this must be an instruction that accesses MMIO space.
 2427                  */
 2428                 gpa = vmcs_gpa();
 2429                 if (vm_mem_allocated(vmx->vm, vcpu, gpa) ||
 2430                     apic_access_fault(vmx, vcpu, gpa)) {
 2431                         vmexit->exitcode = VM_EXITCODE_PAGING;
 2432                         vmexit->inst_length = 0;
 2433                         vmexit->u.paging.gpa = gpa;
 2434                         vmexit->u.paging.fault_type = ept_fault_type(qual);
 2435                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
 2436                 } else if (ept_emulation_fault(qual)) {
 2437                         vmexit_inst_emul(vmexit, gpa, vmcs_gla());
 2438                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
 2439                 }
 2440                 /*
 2441                  * If Virtual NMIs control is 1 and the VM-exit is due to an
 2442                  * EPT fault during the execution of IRET then we must restore
 2443                  * the state of "virtual-NMI blocking" before resuming.
 2444                  *
 2445                  * See description of "NMI unblocking due to IRET" in
 2446                  * "Exit Qualification for EPT Violations".
 2447                  */
 2448                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 2449                     (qual & EXIT_QUAL_NMIUDTI) != 0)
 2450                         vmx_restore_nmi_blocking(vmx, vcpu);
 2451                 break;
 2452         case EXIT_REASON_VIRTUALIZED_EOI:
 2453                 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
 2454                 vmexit->u.ioapic_eoi.vector = qual & 0xFF;
 2455                 vmexit->inst_length = 0;        /* trap-like */
 2456                 break;
 2457         case EXIT_REASON_APIC_ACCESS:
 2458                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
 2459                 break;
 2460         case EXIT_REASON_APIC_WRITE:
 2461                 /*
 2462                  * APIC-write VM exit is trap-like so the %rip is already
 2463                  * pointing to the next instruction.
 2464                  */
 2465                 vmexit->inst_length = 0;
 2466                 vlapic = vm_lapic(vmx->vm, vcpu);
 2467                 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
 2468                 break;
 2469         case EXIT_REASON_XSETBV:
 2470                 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
 2471                 break;
 2472         case EXIT_REASON_MONITOR:
 2473                 vmexit->exitcode = VM_EXITCODE_MONITOR;
 2474                 break;
 2475         case EXIT_REASON_MWAIT:
 2476                 vmexit->exitcode = VM_EXITCODE_MWAIT;
 2477                 break;
 2478         default:
 2479                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
 2480                 break;
 2481         }
 2482 
 2483         if (handled) {
 2484                 /*
 2485                  * It is possible that control is returned to userland
 2486                  * even though we were able to handle the VM exit in the
 2487                  * kernel.
 2488                  *
 2489                  * In such a case we want to make sure that userland
 2490                  * restarts guest execution at the instruction *after*
 2491                  * the one we just processed. Therefore we update the
 2492                  * guest rip in the VMCS and in 'vmexit'.
 2493                  */
 2494                 vmexit->rip += vmexit->inst_length;
 2495                 vmexit->inst_length = 0;
 2496                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
 2497         } else {
 2498                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
 2499                         /*
 2500                          * If this VM exit was not claimed by anybody then
 2501                          * treat it as a generic VMX exit.
 2502                          */
 2503                         vmexit->exitcode = VM_EXITCODE_VMX;
 2504                         vmexit->u.vmx.status = VM_SUCCESS;
 2505                         vmexit->u.vmx.inst_type = 0;
 2506                         vmexit->u.vmx.inst_error = 0;
 2507                 } else {
 2508                         /*
 2509                          * The exitcode and collateral have been populated.
 2510                          * The VM exit will be processed further in userland.
 2511                          */
 2512                 }
 2513         }
 2514         return (handled);
 2515 }
 2516 
 2517 static __inline void
 2518 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
 2519 {
 2520 
 2521         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
 2522             ("vmx_exit_inst_error: invalid inst_fail_status %d",
 2523             vmxctx->inst_fail_status));
 2524 
 2525         vmexit->inst_length = 0;
 2526         vmexit->exitcode = VM_EXITCODE_VMX;
 2527         vmexit->u.vmx.status = vmxctx->inst_fail_status;
 2528         vmexit->u.vmx.inst_error = vmcs_instruction_error();
 2529         vmexit->u.vmx.exit_reason = ~0;
 2530         vmexit->u.vmx.exit_qualification = ~0;
 2531 
 2532         switch (rc) {
 2533         case VMX_VMRESUME_ERROR:
 2534         case VMX_VMLAUNCH_ERROR:
 2535         case VMX_INVEPT_ERROR:
 2536                 vmexit->u.vmx.inst_type = rc;
 2537                 break;
 2538         default:
 2539                 panic("vmx_exit_inst_error: vmx_enter_guest returned %d", rc);
 2540         }
 2541 }
 2542 
 2543 /*
 2544  * If the NMI-exiting VM execution control is set to '1' then an NMI in
 2545  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
 2546  * sufficient to simply vector to the NMI handler via a software interrupt.
 2547  * However, this must be done before maskable interrupts are enabled
 2548  * otherwise the "iret" issued by an interrupt handler will incorrectly
 2549  * clear NMI blocking.
 2550  */
 2551 static __inline void
 2552 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 2553 {
 2554         uint32_t intr_info;
 2555 
 2556         KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
 2557 
 2558         if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
 2559                 return;
 2560 
 2561         intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 2562         KASSERT((intr_info & VMCS_INTR_VALID) != 0,
 2563             ("VM exit interruption info invalid: %#x", intr_info));
 2564 
 2565         if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
 2566                 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
 2567                     "to NMI has invalid vector: %#x", intr_info));
 2568                 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
 2569                 __asm __volatile("int $2");
 2570         }
 2571 }
 2572 
 2573 static int
 2574 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
 2575     struct vm_eventinfo *evinfo)
 2576 {
 2577         int rc, handled, launched;
 2578         struct vmx *vmx;
 2579         struct vm *vm;
 2580         struct vmxctx *vmxctx;
 2581         struct vmcs *vmcs;
 2582         struct vm_exit *vmexit;
 2583         struct vlapic *vlapic;
 2584         uint32_t exit_reason;
 2585 
 2586         vmx = arg;
 2587         vm = vmx->vm;
 2588         vmcs = &vmx->vmcs[vcpu];
 2589         vmxctx = &vmx->ctx[vcpu];
 2590         vlapic = vm_lapic(vm, vcpu);
 2591         vmexit = vm_exitinfo(vm, vcpu);
 2592         launched = 0;
 2593 
 2594         KASSERT(vmxctx->pmap == pmap,
 2595             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
 2596 
 2597         vmx_msr_guest_enter(vmx, vcpu);
 2598 
 2599         VMPTRLD(vmcs);
 2600 
 2601         /*
 2602          * XXX
 2603          * We do this every time because we may set up the virtual machine
 2604          * from a different process than the one that actually runs it.
 2605          *
 2606          * If the life of a virtual machine was spent entirely in the context
 2607          * of a single process we could do this once in vmx_vminit().
 2608          */
 2609         vmcs_write(VMCS_HOST_CR3, rcr3());
 2610 
 2611         vmcs_write(VMCS_GUEST_RIP, rip);
 2612         vmx_set_pcpu_defaults(vmx, vcpu, pmap);
 2613         do {
 2614                 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
 2615                     "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
 2616 
 2617                 handled = UNHANDLED;
 2618                 /*
 2619                  * Interrupts are disabled from this point on until the
 2620                  * guest starts executing. This is done for the following
 2621                  * reasons:
 2622                  *
 2623                  * If an AST is asserted on this thread after the check below,
 2624                  * then the IPI_AST notification will not be lost, because it
 2625                  * will cause a VM exit due to external interrupt as soon as
 2626                  * the guest state is loaded.
 2627                  *
 2628                  * A posted interrupt after 'vmx_inject_interrupts()' will
 2629                  * not be "lost" because it will be held pending in the host
 2630                  * APIC because interrupts are disabled. The pending interrupt
 2631                  * will be recognized as soon as the guest state is loaded.
 2632                  *
 2633                  * The same reasoning applies to the IPI generated by
 2634                  * pmap_invalidate_ept().
 2635                  */
 2636                 disable_intr();
 2637                 vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
 2638 
 2639                 /*
 2640                  * Check for vcpu suspension after injecting events because
 2641                  * vmx_inject_interrupts() can suspend the vcpu due to a
 2642                  * triple fault.
 2643                  */
 2644                 if (vcpu_suspended(evinfo)) {
 2645                         enable_intr();
 2646                         vm_exit_suspended(vmx->vm, vcpu, rip);
 2647                         break;
 2648                 }
 2649 
 2650                 if (vcpu_rendezvous_pending(evinfo)) {
 2651                         enable_intr();
 2652                         vm_exit_rendezvous(vmx->vm, vcpu, rip);
 2653                         break;
 2654                 }
 2655 
 2656                 if (vcpu_reqidle(evinfo)) {
 2657                         enable_intr();
 2658                         vm_exit_reqidle(vmx->vm, vcpu, rip);
 2659                         break;
 2660                 }
 2661 
 2662                 if (vcpu_should_yield(vm, vcpu)) {
 2663                         enable_intr();
 2664                         vm_exit_astpending(vmx->vm, vcpu, rip);
 2665                         vmx_astpending_trace(vmx, vcpu, rip);
 2666                         handled = HANDLED;
 2667                         break;
 2668                 }
 2669 
 2670                 vmx_run_trace(vmx, vcpu);
 2671                 rc = vmx_enter_guest(vmxctx, vmx, launched);
 2672 
 2673                 /* Collect some information for VM exit processing */
 2674                 vmexit->rip = rip = vmcs_guest_rip();
 2675                 vmexit->inst_length = vmexit_instruction_length();
 2676                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
 2677                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
 2678 
 2679                 /* Update 'nextrip' */
 2680                 vmx->state[vcpu].nextrip = rip;
 2681 
 2682                 if (rc == VMX_GUEST_VMEXIT) {
 2683                         vmx_exit_handle_nmi(vmx, vcpu, vmexit);
 2684                         enable_intr();
 2685                         handled = vmx_exit_process(vmx, vcpu, vmexit);
 2686                 } else {
 2687                         enable_intr();
 2688                         vmx_exit_inst_error(vmxctx, rc, vmexit);
 2689                 }
 2690                 launched = 1;
 2691                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
 2692                 rip = vmexit->rip;
 2693         } while (handled);
 2694 
 2695         /*
 2696          * If a VM exit has been handled then the exitcode must be BOGUS;
 2697          * if a VM exit is not handled then the exitcode must not be BOGUS.
 2698          */
 2699         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
 2700             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
 2701                 panic("Mismatch between handled (%d) and exitcode (%d)",
 2702                       handled, vmexit->exitcode);
 2703         }
 2704 
 2705         if (!handled)
 2706                 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
 2707 
 2708         VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
 2709             vmexit->exitcode);
 2710 
 2711         VMCLEAR(vmcs);
 2712         vmx_msr_guest_exit(vmx, vcpu);
 2713 
 2714         return (0);
 2715 }
 2716 
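      /*
       * Release the per-VM VT-x state: the APIC access page mapping (when
       * APIC access virtualization was in use), the per-vcpu VPIDs and the
       * vmx softc itself.
       */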
 2717 static void
 2718 vmx_vmcleanup(void *arg)
 2719 {
 2720         int i;
 2721         struct vmx *vmx = arg;
 2722 
 2723         if (apic_access_virtualization(vmx, 0))
 2724                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 2725 
 2726         for (i = 0; i < VM_MAXCPU; i++)
 2727                 vpid_free(vmx->state[i].vpid);
 2728 
 2729         free(vmx, M_VMX);
 2730 
 2731         return;
 2732 }
 2733 
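      /*
       * Return a pointer to the copy of a guest register that is maintained
       * in the vmxctx (the general purpose registers and %cr2), or NULL if
       * the register is not kept there and must be accessed through the
       * VMCS instead.
       */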
 2734 static register_t *
 2735 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
 2736 {
 2737 
 2738         switch (reg) {
 2739         case VM_REG_GUEST_RAX:
 2740                 return (&vmxctx->guest_rax);
 2741         case VM_REG_GUEST_RBX:
 2742                 return (&vmxctx->guest_rbx);
 2743         case VM_REG_GUEST_RCX:
 2744                 return (&vmxctx->guest_rcx);
 2745         case VM_REG_GUEST_RDX:
 2746                 return (&vmxctx->guest_rdx);
 2747         case VM_REG_GUEST_RSI:
 2748                 return (&vmxctx->guest_rsi);
 2749         case VM_REG_GUEST_RDI:
 2750                 return (&vmxctx->guest_rdi);
 2751         case VM_REG_GUEST_RBP:
 2752                 return (&vmxctx->guest_rbp);
 2753         case VM_REG_GUEST_R8:
 2754                 return (&vmxctx->guest_r8);
 2755         case VM_REG_GUEST_R9:
 2756                 return (&vmxctx->guest_r9);
 2757         case VM_REG_GUEST_R10:
 2758                 return (&vmxctx->guest_r10);
 2759         case VM_REG_GUEST_R11:
 2760                 return (&vmxctx->guest_r11);
 2761         case VM_REG_GUEST_R12:
 2762                 return (&vmxctx->guest_r12);
 2763         case VM_REG_GUEST_R13:
 2764                 return (&vmxctx->guest_r13);
 2765         case VM_REG_GUEST_R14:
 2766                 return (&vmxctx->guest_r14);
 2767         case VM_REG_GUEST_R15:
 2768                 return (&vmxctx->guest_r15);
 2769         case VM_REG_GUEST_CR2:
 2770                 return (&vmxctx->guest_cr2);
 2771         default:
 2772                 break;
 2773         }
 2774         return (NULL);
 2775 }
 2776 
 2777 static int
 2778 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
 2779 {
 2780         register_t *regp;
 2781 
 2782         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
 2783                 *retval = *regp;
 2784                 return (0);
 2785         } else
 2786                 return (EINVAL);
 2787 }
 2788 
 2789 static int
 2790 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
 2791 {
 2792         register_t *regp;
 2793 
 2794         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
 2795                 *regp = val;
 2796                 return (0);
 2797         } else
 2798                 return (EINVAL);
 2799 }
 2800 
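      /*
       * Report the interrupt shadow as a boolean derived from the guest
       * interruptibility field of the VMCS: non-zero when hardware interrupt
       * delivery is blocked (e.g. by STI or MOV SS blocking).
       */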
 2801 static int
 2802 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
 2803 {
 2804         uint64_t gi;
 2805         int error;
 2806 
 2807         error = vmcs_getreg(&vmx->vmcs[vcpu], running, 
 2808             VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
 2809         *retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
 2810         return (error);
 2811 }
 2812 
 2813 static int
 2814 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
 2815 {
 2816         struct vmcs *vmcs;
 2817         uint64_t gi;
 2818         int error, ident;
 2819 
 2820         /*
 2821          * Forcing the vcpu into an interrupt shadow is not supported.
 2822          */
 2823         if (val) {
 2824                 error = EINVAL;
 2825                 goto done;
 2826         }
 2827 
 2828         vmcs = &vmx->vmcs[vcpu];
 2829         ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
 2830         error = vmcs_getreg(vmcs, running, ident, &gi);
 2831         if (error == 0) {
 2832                 gi &= ~HWINTR_BLOCKING;
 2833                 error = vmcs_setreg(vmcs, running, ident, gi);
 2834         }
 2835 done:
 2836         VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
 2837             error ? "failed" : "succeeded");
 2838         return (error);
 2839 }
 2840 
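      /*
       * Map a guest control register to its VMCS read shadow. Only %cr0 and
       * %cr4 have shadows; -1 is returned for everything else.
       */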
 2841 static int
 2842 vmx_shadow_reg(int reg)
 2843 {
 2844         int shreg;
 2845 
 2846         shreg = -1;
 2847 
 2848         switch (reg) {
 2849         case VM_REG_GUEST_CR0:
 2850                 shreg = VMCS_CR0_SHADOW;
 2851                 break;
 2852         case VM_REG_GUEST_CR4:
 2853                 shreg = VMCS_CR4_SHADOW;
 2854                 break;
 2855         default:
 2856                 break;
 2857         }
 2858 
 2859         return (shreg);
 2860 }
 2861 
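      /*
       * Guest registers cached in the vmxctx are accessed directly; all
       * other registers go through the VMCS, which is only safe if the vcpu
       * is not running or is running on the current host cpu.
       */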
 2862 static int
 2863 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
 2864 {
 2865         int running, hostcpu;
 2866         struct vmx *vmx = arg;
 2867 
 2868         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 2869         if (running && hostcpu != curcpu)
 2870                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
 2871 
 2872         if (reg == VM_REG_GUEST_INTR_SHADOW)
 2873                 return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
 2874 
 2875         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
 2876                 return (0);
 2877 
 2878         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
 2879 }
 2880 
 2881 static int
 2882 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
 2883 {
 2884         int error, hostcpu, running, shadow;
 2885         uint64_t ctls;
 2886         pmap_t pmap;
 2887         struct vmx *vmx = arg;
 2888 
 2889         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 2890         if (running && hostcpu != curcpu)
 2891                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
 2892 
 2893         if (reg == VM_REG_GUEST_INTR_SHADOW)
 2894                 return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
 2895 
 2896         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
 2897                 return (0);
 2898 
 2899         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
 2900 
 2901         if (error == 0) {
 2902                 /*
 2903                  * If the "load EFER" VM-entry control is 1 then the
 2904                  * value of EFER.LMA must be identical to the "IA-32e mode
 2905                  * guest" bit in the VM-entry controls.
 2906                  */
 2907                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
 2908                     (reg == VM_REG_GUEST_EFER)) {
 2909                         vmcs_getreg(&vmx->vmcs[vcpu], running,
 2910                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
 2911                         if (val & EFER_LMA)
 2912                                 ctls |= VM_ENTRY_GUEST_LMA;
 2913                         else
 2914                                 ctls &= ~VM_ENTRY_GUEST_LMA;
 2915                         vmcs_setreg(&vmx->vmcs[vcpu], running,
 2916                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
 2917                 }
 2918 
 2919                 shadow = vmx_shadow_reg(reg);
 2920                 if (shadow > 0) {
 2921                         /*
 2922                          * Store the unmodified value in the shadow
 2923                          */                     
 2924                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
 2925                                     VMCS_IDENT(shadow), val);
 2926                 }
 2927 
 2928                 if (reg == VM_REG_GUEST_CR3) {
 2929                         /*
 2930                          * Invalidate the guest vcpu's TLB mappings to emulate
 2931                          * the behavior of updating %cr3.
 2932                          *
 2933                          * XXX the processor retains global mappings when %cr3
 2934                          * is updated but vmx_invvpid() does not.
 2935                          */
 2936                         pmap = vmx->ctx[vcpu].pmap;
 2937                         vmx_invvpid(vmx, vcpu, pmap, running);
 2938                 }
 2939         }
 2940 
 2941         return (error);
 2942 }
 2943 
 2944 static int
 2945 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 2946 {
 2947         int hostcpu, running;
 2948         struct vmx *vmx = arg;
 2949 
 2950         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 2951         if (running && hostcpu != curcpu)
 2952                 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
 2953 
 2954         return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
 2955 }
 2956 
 2957 static int
 2958 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 2959 {
 2960         int hostcpu, running;
 2961         struct vmx *vmx = arg;
 2962 
 2963         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 2964         if (running && hostcpu != curcpu)
 2965                 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
 2966 
 2967         return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
 2968 }
 2969 
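      /*
       * Return ENOENT if the requested capability is not supported by the
       * hardware; otherwise report whether it is currently enabled for the
       * vcpu via the per-vcpu 'set' bitmap.
       */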
 2970 static int
 2971 vmx_getcap(void *arg, int vcpu, int type, int *retval)
 2972 {
 2973         struct vmx *vmx = arg;
 2974         int vcap;
 2975         int ret;
 2976 
 2977         ret = ENOENT;
 2978 
 2979         vcap = vmx->cap[vcpu].set;
 2980 
 2981         switch (type) {
 2982         case VM_CAP_HALT_EXIT:
 2983                 if (cap_halt_exit)
 2984                         ret = 0;
 2985                 break;
 2986         case VM_CAP_PAUSE_EXIT:
 2987                 if (cap_pause_exit)
 2988                         ret = 0;
 2989                 break;
 2990         case VM_CAP_MTRAP_EXIT:
 2991                 if (cap_monitor_trap)
 2992                         ret = 0;
 2993                 break;
 2994         case VM_CAP_UNRESTRICTED_GUEST:
 2995                 if (cap_unrestricted_guest)
 2996                         ret = 0;
 2997                 break;
 2998         case VM_CAP_ENABLE_INVPCID:
 2999                 if (cap_invpcid)
 3000                         ret = 0;
 3001                 break;
 3002         default:
 3003                 break;
 3004         }
 3005 
 3006         if (ret == 0)
 3007                 *retval = (vcap & (1 << type)) ? 1 : 0;
 3008 
 3009         return (ret);
 3010 }
 3011 
 3012 static int
 3013 vmx_setcap(void *arg, int vcpu, int type, int val)
 3014 {
 3015         struct vmx *vmx = arg;
 3016         struct vmcs *vmcs = &vmx->vmcs[vcpu];
 3017         uint32_t baseval;
 3018         uint32_t *pptr;
 3019         int error;
 3020         int flag;
 3021         int reg;
 3022         int retval;
 3023 
 3024         retval = ENOENT;
 3025         pptr = NULL;
 3026 
 3027         switch (type) {
 3028         case VM_CAP_HALT_EXIT:
 3029                 if (cap_halt_exit) {
 3030                         retval = 0;
 3031                         pptr = &vmx->cap[vcpu].proc_ctls;
 3032                         baseval = *pptr;
 3033                         flag = PROCBASED_HLT_EXITING;
 3034                         reg = VMCS_PRI_PROC_BASED_CTLS;
 3035                 }
 3036                 break;
 3037         case VM_CAP_MTRAP_EXIT:
 3038                 if (cap_monitor_trap) {
 3039                         retval = 0;
 3040                         pptr = &vmx->cap[vcpu].proc_ctls;
 3041                         baseval = *pptr;
 3042                         flag = PROCBASED_MTF;
 3043                         reg = VMCS_PRI_PROC_BASED_CTLS;
 3044                 }
 3045                 break;
 3046         case VM_CAP_PAUSE_EXIT:
 3047                 if (cap_pause_exit) {
 3048                         retval = 0;
 3049                         pptr = &vmx->cap[vcpu].proc_ctls;
 3050                         baseval = *pptr;
 3051                         flag = PROCBASED_PAUSE_EXITING;
 3052                         reg = VMCS_PRI_PROC_BASED_CTLS;
 3053                 }
 3054                 break;
 3055         case VM_CAP_UNRESTRICTED_GUEST:
 3056                 if (cap_unrestricted_guest) {
 3057                         retval = 0;
 3058                         pptr = &vmx->cap[vcpu].proc_ctls2;
 3059                         baseval = *pptr;
 3060                         flag = PROCBASED2_UNRESTRICTED_GUEST;
 3061                         reg = VMCS_SEC_PROC_BASED_CTLS;
 3062                 }
 3063                 break;
 3064         case VM_CAP_ENABLE_INVPCID:
 3065                 if (cap_invpcid) {
 3066                         retval = 0;
 3067                         pptr = &vmx->cap[vcpu].proc_ctls2;
 3068                         baseval = *pptr;
 3069                         flag = PROCBASED2_ENABLE_INVPCID;
 3070                         reg = VMCS_SEC_PROC_BASED_CTLS;
 3071                 }
 3072                 break;
 3073         default:
 3074                 break;
 3075         }
 3076 
 3077         if (retval == 0) {
 3078                 if (val) {
 3079                         baseval |= flag;
 3080                 } else {
 3081                         baseval &= ~flag;
 3082                 }
 3083                 VMPTRLD(vmcs);
 3084                 error = vmwrite(reg, baseval);
 3085                 VMCLEAR(vmcs);
 3086 
 3087                 if (error) {
 3088                         retval = error;
 3089                 } else {
 3090                         /*
 3091                          * Update the cached control word, if any, and
 3092                          * record the new capability setting.
 3093                          */
 3094                         if (pptr != NULL) {
 3095                                 *pptr = baseval;
 3096                         }
 3097 
 3098                         if (val) {
 3099                                 vmx->cap[vcpu].set |= (1 << type);
 3100                         } else {
 3101                                 vmx->cap[vcpu].set &= ~(1 << type);
 3102                         }
 3103                 }
 3104         }
 3105 
 3106         return (retval);
 3107 }
 3108 
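      /*
       * VT-x specific vlapic state: the generic vlapic extended with the
       * posted interrupt descriptor (consulted by the processor when posted
       * interrupts are enabled) and a back pointer to the vmx softc.
       */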
 3109 struct vlapic_vtx {
 3110         struct vlapic   vlapic;
 3111         struct pir_desc *pir_desc;
 3112         struct vmx      *vmx;
 3113 };
 3114 
 3115 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
 3116 do {                                                                    \
 3117         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
 3118             level ? "level" : "edge", vector);                          \
 3119         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
 3120         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
 3121         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
 3122         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
 3123         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
 3124 } while (0)
 3125 
 3126 /*
 3127  * vlapic->ops handlers that utilize the APICv hardware assist described in
 3128  * Chapter 29 of the Intel SDM.
 3129  */
 3130 static int
 3131 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
 3132 {
 3133         struct vlapic_vtx *vlapic_vtx;
 3134         struct pir_desc *pir_desc;
 3135         uint64_t mask;
 3136         int idx, notify;
 3137 
 3138         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3139         pir_desc = vlapic_vtx->pir_desc;
 3140 
 3141         /*
 3142          * Keep track of interrupt requests in the PIR descriptor. This is
 3143          * because the virtual APIC page pointed to by the VMCS cannot be
 3144          * modified if the vcpu is running.
 3145          */
 3146         idx = vector / 64;
 3147         mask = 1UL << (vector % 64);
 3148         atomic_set_long(&pir_desc->pir[idx], mask);
 3149         notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
 3150 
 3151         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
 3152             level, "vmx_set_intr_ready");
 3153         return (notify);
 3154 }
 3155 
 3156 static int
 3157 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
 3158 {
 3159         struct vlapic_vtx *vlapic_vtx;
 3160         struct pir_desc *pir_desc;
 3161         struct LAPIC *lapic;
 3162         uint64_t pending, pirval;
 3163         uint32_t ppr, vpr;
 3164         int i;
 3165 
 3166         /*
 3167          * This function is only expected to be called from the 'HLT' exit
 3168          * handler which does not care about the vector that is pending.
 3169          */
 3170         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
 3171 
 3172         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3173         pir_desc = vlapic_vtx->pir_desc;
 3174 
 3175         pending = atomic_load_acq_long(&pir_desc->pending);
 3176         if (!pending)
 3177                 return (0);     /* common case */
 3178 
 3179         /*
 3180          * If there is an interrupt pending then it will be recognized only
 3181          * if its priority is greater than the processor priority.
 3182          *
 3183          * Special case: if the processor priority is zero then any pending
 3184          * interrupt will be recognized.
 3185          */
 3186         lapic = vlapic->apic_page;
 3187         ppr = lapic->ppr & 0xf0;
 3188         if (ppr == 0)
 3189                 return (1);
 3190 
 3191         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
 3192             lapic->ppr);
 3193 
 3194         for (i = 3; i >= 0; i--) {
 3195                 pirval = pir_desc->pir[i];
 3196                 if (pirval != 0) {
 3197                         vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
 3198                         return (vpr > ppr);
 3199                 }
 3200         }
 3201         return (0);
 3202 }
 3203 
 3204 static void
 3205 vmx_intr_accepted(struct vlapic *vlapic, int vector)
 3206 {
 3207 
 3208         panic("vmx_intr_accepted: not expected to be called");
 3209 }
 3210 
 3211 static void
 3212 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
 3213 {
 3214         struct vlapic_vtx *vlapic_vtx;
 3215         struct vmx *vmx;
 3216         struct vmcs *vmcs;
 3217         uint64_t mask, val;
 3218 
 3219         KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
 3220         KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
 3221             ("vmx_set_tmr: vcpu cannot be running"));
 3222 
 3223         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3224         vmx = vlapic_vtx->vmx;
 3225         vmcs = &vmx->vmcs[vlapic->vcpuid];
 3226         mask = 1UL << (vector % 64);
 3227 
 3228         VMPTRLD(vmcs);
 3229         val = vmcs_read(VMCS_EOI_EXIT(vector));
 3230         if (level)
 3231                 val |= mask;
 3232         else
 3233                 val &= ~mask;
 3234         vmcs_write(VMCS_EOI_EXIT(vector), val);
 3235         VMCLEAR(vmcs);
 3236 }
 3237 
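      /*
       * Switch a vcpu from virtualized APIC accesses to virtualized x2APIC
       * mode. The APIC access page and the MSR bitmap are shared by all
       * vcpus, so those are adjusted only once, from vcpu 0.
       */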
 3238 static void
 3239 vmx_enable_x2apic_mode(struct vlapic *vlapic)
 3240 {
 3241         struct vmx *vmx;
 3242         struct vmcs *vmcs;
 3243         uint32_t proc_ctls2;
 3244         int vcpuid, error;
 3245 
 3246         vcpuid = vlapic->vcpuid;
 3247         vmx = ((struct vlapic_vtx *)vlapic)->vmx;
 3248         vmcs = &vmx->vmcs[vcpuid];
 3249 
 3250         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
 3251         KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
 3252             ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
 3253 
 3254         proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
 3255         proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
 3256         vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
 3257 
 3258         VMPTRLD(vmcs);
 3259         vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
 3260         VMCLEAR(vmcs);
 3261 
 3262         if (vlapic->vcpuid == 0) {
 3263                 /*
 3264                  * The nested page table mappings are shared by all vcpus
 3265                  * so unmap the APIC access page just once.
 3266                  */
 3267                 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 3268                 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
 3269                     __func__, error));
 3270 
 3271                 /*
 3272                  * The MSR bitmap is shared by all vcpus so modify it only
 3273                  * once in the context of vcpu 0.
 3274                  */
 3275                 error = vmx_allow_x2apic_msrs(vmx);
 3276                 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
 3277                     __func__, error));
 3278         }
 3279 }
 3280 
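      /*
       * Interrupt the host cpu on which the vcpu is running with the posted
       * interrupt notification vector so that the pending bits in the PIR
       * descriptor are processed by the hardware.
       */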
 3281 static void
 3282 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
 3283 {
 3284 
 3285         ipi_cpu(hostcpu, pirvec);
 3286 }
 3287 
 3288 /*
 3289  * Transfer the pending interrupts in the PIR descriptor to the IRR
 3290  * in the virtual APIC page.
 3291  */
 3292 static void
 3293 vmx_inject_pir(struct vlapic *vlapic)
 3294 {
 3295         struct vlapic_vtx *vlapic_vtx;
 3296         struct pir_desc *pir_desc;
 3297         struct LAPIC *lapic;
 3298         uint64_t val, pirval;
 3299         int rvi, pirbase = -1;
 3300         uint16_t intr_status_old, intr_status_new;
 3301 
 3302         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3303         pir_desc = vlapic_vtx->pir_desc;
 3304         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
 3305                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 3306                     "no posted interrupt pending");
 3307                 return;
 3308         }
 3309 
 3310         pirval = 0;
 3311         pirbase = -1;
 3312         lapic = vlapic->apic_page;
 3313 
 3314         val = atomic_readandclear_long(&pir_desc->pir[0]);
 3315         if (val != 0) {
 3316                 lapic->irr0 |= val;
 3317                 lapic->irr1 |= val >> 32;
 3318                 pirbase = 0;
 3319                 pirval = val;
 3320         }
 3321 
 3322         val = atomic_readandclear_long(&pir_desc->pir[1]);
 3323         if (val != 0) {
 3324                 lapic->irr2 |= val;
 3325                 lapic->irr3 |= val >> 32;
 3326                 pirbase = 64;
 3327                 pirval = val;
 3328         }
 3329 
 3330         val = atomic_readandclear_long(&pir_desc->pir[2]);
 3331         if (val != 0) {
 3332                 lapic->irr4 |= val;
 3333                 lapic->irr5 |= val >> 32;
 3334                 pirbase = 128;
 3335                 pirval = val;
 3336         }
 3337 
 3338         val = atomic_readandclear_long(&pir_desc->pir[3]);
 3339         if (val != 0) {
 3340                 lapic->irr6 |= val;
 3341                 lapic->irr7 |= val >> 32;
 3342                 pirbase = 192;
 3343                 pirval = val;
 3344         }
 3345 
 3346         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
 3347 
 3348         /*
 3349          * Update RVI so the processor can evaluate pending virtual
 3350          * interrupts on VM-entry.
 3351          *
 3352          * It is possible for pirval to be 0 here, even though the
 3353          * pending bit has been set. The scenario is:
 3354          * CPU-Y is sending a posted interrupt to CPU-X, which
 3355          * is running a guest and processing posted interrupts in h/w.
 3356          * CPU-X will eventually exit and the state seen in s/w is
 3357          * the pending bit set, but no PIR bits set.
 3358          *
 3359          *      CPU-X                      CPU-Y
 3360          *   (vm running)                (host running)
 3361          *   rx posted interrupt
 3362          *   CLEAR pending bit
 3363          *                               SET PIR bit
 3364          *   READ/CLEAR PIR bits
 3365          *                               SET pending bit
 3366          *   (vm exit)
 3367          *   pending bit set, PIR 0
 3368          */
 3369         if (pirval != 0) {
 3370                 rvi = pirbase + flsl(pirval) - 1;
 3371                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
 3372                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
 3373                 if (intr_status_new > intr_status_old) {
 3374                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
 3375                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 3376                             "guest_intr_status changed from 0x%04x to 0x%04x",
 3377                             intr_status_old, intr_status_new);
 3378                 }
 3379         }
 3380 }
 3381 
 3382 static struct vlapic *
 3383 vmx_vlapic_init(void *arg, int vcpuid)
 3384 {
 3385         struct vmx *vmx;
 3386         struct vlapic *vlapic;
 3387         struct vlapic_vtx *vlapic_vtx;
 3388         
 3389         vmx = arg;
 3390 
 3391         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
 3392         vlapic->vm = vmx->vm;
 3393         vlapic->vcpuid = vcpuid;
 3394         vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
 3395 
 3396         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3397         vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
 3398         vlapic_vtx->vmx = vmx;
 3399 
 3400         if (virtual_interrupt_delivery) {
 3401                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
 3402                 vlapic->ops.pending_intr = vmx_pending_intr;
 3403                 vlapic->ops.intr_accepted = vmx_intr_accepted;
 3404                 vlapic->ops.set_tmr = vmx_set_tmr;
 3405                 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
 3406         }
 3407 
 3408         if (posted_interrupts)
 3409                 vlapic->ops.post_intr = vmx_post_intr;
 3410 
 3411         vlapic_init(vlapic);
 3412 
 3413         return (vlapic);
 3414 }
 3415 
 3416 static void
 3417 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
 3418 {
 3419 
 3420         vlapic_cleanup(vlapic);
 3421         free(vlapic, M_VLAPIC);
 3422 }
 3423 
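      /*
       * Entry points used by the machine-independent vmm(4) layer for the
       * Intel VT-x implementation.
       */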
 3424 struct vmm_ops vmm_ops_intel = {
 3425         vmx_init,
 3426         vmx_cleanup,
 3427         vmx_restore,
 3428         vmx_vminit,
 3429         vmx_run,
 3430         vmx_vmcleanup,
 3431         vmx_getreg,
 3432         vmx_setreg,
 3433         vmx_getdesc,
 3434         vmx_setdesc,
 3435         vmx_getcap,
 3436         vmx_setcap,
 3437         ept_vmspace_alloc,
 3438         ept_vmspace_free,
 3439         vmx_vlapic_init,
 3440         vmx_vlapic_cleanup,
 3441 };
