FreeBSD/Linux Kernel Cross Reference
sys/amd64/vmm/intel/vmx.c

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2011 NetApp, Inc.
    5  * All rights reserved.
    6  * Copyright (c) 2018 Joyent, Inc.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  *
   29  * $FreeBSD$
   30  */
   31 
   32 #include <sys/cdefs.h>
   33 __FBSDID("$FreeBSD$");
   34 
   35 #include <sys/param.h>
   36 #include <sys/systm.h>
   37 #include <sys/smp.h>
   38 #include <sys/kernel.h>
   39 #include <sys/malloc.h>
   40 #include <sys/pcpu.h>
   41 #include <sys/proc.h>
   42 #include <sys/sysctl.h>
   43 
   44 #include <vm/vm.h>
   45 #include <vm/pmap.h>
   46 
   47 #include <machine/psl.h>
   48 #include <machine/cpufunc.h>
   49 #include <machine/md_var.h>
   50 #include <machine/reg.h>
   51 #include <machine/segments.h>
   52 #include <machine/smp.h>
   53 #include <machine/specialreg.h>
   54 #include <machine/vmparam.h>
   55 
   56 #include <machine/vmm.h>
   57 #include <machine/vmm_dev.h>
   58 #include <machine/vmm_instruction_emul.h>
   59 #include "vmm_lapic.h"
   60 #include "vmm_host.h"
   61 #include "vmm_ioport.h"
   62 #include "vmm_ktr.h"
   63 #include "vmm_stat.h"
   64 #include "vatpic.h"
   65 #include "vlapic.h"
   66 #include "vlapic_priv.h"
   67 
   68 #include "ept.h"
   69 #include "vmx_cpufunc.h"
   70 #include "vmx.h"
   71 #include "vmx_msr.h"
   72 #include "x86.h"
   73 #include "vmx_controls.h"
   74 
   75 #define PINBASED_CTLS_ONE_SETTING                                       \
   76         (PINBASED_EXTINT_EXITING        |                               \
   77          PINBASED_NMI_EXITING           |                               \
   78          PINBASED_VIRTUAL_NMI)
   79 #define PINBASED_CTLS_ZERO_SETTING      0
   80 
   81 #define PROCBASED_CTLS_WINDOW_SETTING                                   \
   82         (PROCBASED_INT_WINDOW_EXITING   |                               \
   83          PROCBASED_NMI_WINDOW_EXITING)
   84 
   85 #define PROCBASED_CTLS_ONE_SETTING                                      \
   86         (PROCBASED_SECONDARY_CONTROLS   |                               \
   87          PROCBASED_MWAIT_EXITING        |                               \
   88          PROCBASED_MONITOR_EXITING      |                               \
   89          PROCBASED_IO_EXITING           |                               \
   90          PROCBASED_MSR_BITMAPS          |                               \
   91          PROCBASED_CTLS_WINDOW_SETTING  |                               \
   92          PROCBASED_CR8_LOAD_EXITING     |                               \
   93          PROCBASED_CR8_STORE_EXITING)
   94 #define PROCBASED_CTLS_ZERO_SETTING     \
   95         (PROCBASED_CR3_LOAD_EXITING |   \
   96         PROCBASED_CR3_STORE_EXITING |   \
   97         PROCBASED_IO_BITMAPS)
   98 
   99 #define PROCBASED_CTLS2_ONE_SETTING     PROCBASED2_ENABLE_EPT
  100 #define PROCBASED_CTLS2_ZERO_SETTING    0
  101 
  102 #define VM_EXIT_CTLS_ONE_SETTING                                        \
  103         (VM_EXIT_SAVE_DEBUG_CONTROLS            |                       \
  104         VM_EXIT_HOST_LMA                        |                       \
  105         VM_EXIT_SAVE_EFER                       |                       \
  106         VM_EXIT_LOAD_EFER                       |                       \
  107         VM_EXIT_ACKNOWLEDGE_INTERRUPT)
  108 
  109 #define VM_EXIT_CTLS_ZERO_SETTING       0
  110 
  111 #define VM_ENTRY_CTLS_ONE_SETTING                                       \
  112         (VM_ENTRY_LOAD_DEBUG_CONTROLS           |                       \
  113         VM_ENTRY_LOAD_EFER)
  114 
  115 #define VM_ENTRY_CTLS_ZERO_SETTING                                      \
  116         (VM_ENTRY_INTO_SMM                      |                       \
  117         VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
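
/*
 * A note on the *_ONE_SETTING / *_ZERO_SETTING pairs above: the "one"
 * settings name control bits that vmx_init() requires to be 1 and the
 * "zero" settings name bits that are required to be 0.  vmx_set_ctlreg()
 * consults the corresponding VMX capability MSR pair to verify that the
 * processor allows each bit to take the desired value, for example
 * (mirroring the pin-based call made in vmx_init() below):
 *
 *	error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
 *	    MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING,
 *	    PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
 */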
  118 
  119 #define HANDLED         1
  120 #define UNHANDLED       0
  121 
  122 static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
  123 static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
  124 
  125 SYSCTL_DECL(_hw_vmm);
  126 SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
  127 
  128 int vmxon_enabled[MAXCPU];
  129 static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
  130 
  131 static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
  132 static uint32_t exit_ctls, entry_ctls;
  133 
  134 static uint64_t cr0_ones_mask, cr0_zeros_mask;
  135 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
  136              &cr0_ones_mask, 0, NULL);
  137 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
  138              &cr0_zeros_mask, 0, NULL);
  139 
  140 static uint64_t cr4_ones_mask, cr4_zeros_mask;
  141 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
  142              &cr4_ones_mask, 0, NULL);
  143 SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
  144              &cr4_zeros_mask, 0, NULL);
  145 
  146 static int vmx_initialized;
  147 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
  148            &vmx_initialized, 0, "Intel VMX initialized");
  149 
  150 /*
  151  * Optional capabilities
  152  */
  153 static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL);
  154 
  155 static int cap_halt_exit;
  156 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0,
  157     "HLT triggers a VM-exit");
  158 
  159 static int cap_pause_exit;
  160 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit,
  161     0, "PAUSE triggers a VM-exit");
  162 
  163 static int cap_rdpid;
  164 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdpid, CTLFLAG_RD, &cap_rdpid, 0,
  165     "Guests are allowed to use RDPID");
  166 
  167 static int cap_rdtscp;
  168 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, rdtscp, CTLFLAG_RD, &cap_rdtscp, 0,
  169     "Guests are allowed to use RDTSCP");
  170 
  171 static int cap_unrestricted_guest;
  172 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD,
  173     &cap_unrestricted_guest, 0, "Unrestricted guests");
  174 
  175 static int cap_monitor_trap;
  176 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD,
  177     &cap_monitor_trap, 0, "Monitor trap flag");
  178 
  179 static int cap_invpcid;
  180 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid,
  181     0, "Guests are allowed to use INVPCID");
  182 
  183 static int tpr_shadowing;
  184 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, tpr_shadowing, CTLFLAG_RD,
  185     &tpr_shadowing, 0, "TPR shadowing support");
  186 
  187 static int virtual_interrupt_delivery;
  188 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
  189     &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
  190 
  191 static int posted_interrupts;
  192 SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD,
  193     &posted_interrupts, 0, "APICv posted interrupt support");
  194 
  195 static int pirvec = -1;
  196 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
  197     &pirvec, 0, "APICv posted interrupt vector");
  198 
  199 static struct unrhdr *vpid_unr;
  200 static u_int vpid_alloc_failed;
  201 SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
  202             &vpid_alloc_failed, 0, NULL);
  203 
  204 int guest_l1d_flush;
  205 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
  206     &guest_l1d_flush, 0, NULL);
  207 int guest_l1d_flush_sw;
  208 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD,
  209     &guest_l1d_flush_sw, 0, NULL);
  210 
  211 static struct msr_entry msr_load_list[1] __aligned(16);
  212 
  213 /*
  214  * The definitions of SDT probes for VMX.
  215  */
  216 
  217 SDT_PROBE_DEFINE3(vmm, vmx, exit, entry,
  218     "struct vmx *", "int", "struct vm_exit *");
  219 
  220 SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch,
  221     "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *");
  222 
  223 SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess,
  224     "struct vmx *", "int", "struct vm_exit *", "uint64_t");
  225 
  226 SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr,
  227     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
  228 
  229 SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr,
  230     "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t");
  231 
  232 SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
  233     "struct vmx *", "int", "struct vm_exit *");
  234 
  235 SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap,
  236     "struct vmx *", "int", "struct vm_exit *");
  237 
  238 SDT_PROBE_DEFINE3(vmm, vmx, exit, pause,
  239     "struct vmx *", "int", "struct vm_exit *");
  240 
  241 SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow,
  242     "struct vmx *", "int", "struct vm_exit *");
  243 
  244 SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt,
  245     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
  246 
  247 SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow,
  248     "struct vmx *", "int", "struct vm_exit *");
  249 
  250 SDT_PROBE_DEFINE3(vmm, vmx, exit, inout,
  251     "struct vmx *", "int", "struct vm_exit *");
  252 
  253 SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid,
  254     "struct vmx *", "int", "struct vm_exit *");
  255 
  256 SDT_PROBE_DEFINE5(vmm, vmx, exit, exception,
  257     "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int");
  258 
  259 SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault,
  260     "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t");
  261 
  262 SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault,
  263     "struct vmx *", "int", "struct vm_exit *", "uint64_t");
  264 
  265 SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi,
  266     "struct vmx *", "int", "struct vm_exit *");
  267 
  268 SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess,
  269     "struct vmx *", "int", "struct vm_exit *");
  270 
  271 SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite,
  272     "struct vmx *", "int", "struct vm_exit *", "struct vlapic *");
  273 
  274 SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv,
  275     "struct vmx *", "int", "struct vm_exit *");
  276 
  277 SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor,
  278     "struct vmx *", "int", "struct vm_exit *");
  279 
  280 SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait,
  281     "struct vmx *", "int", "struct vm_exit *");
  282 
  283 SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn,
  284     "struct vmx *", "int", "struct vm_exit *");
  285 
  286 SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown,
  287     "struct vmx *", "int", "struct vm_exit *", "uint32_t");
  288 
  289 SDT_PROBE_DEFINE4(vmm, vmx, exit, return,
  290     "struct vmx *", "int", "struct vm_exit *", "int");
  291 
  292 /*
  293  * Use the last page below 4GB as the APIC access address. This address is
  294  * occupied by the boot firmware so it is guaranteed that it will not conflict
  295  * with a page in system memory.
  296  */
  297 #define APIC_ACCESS_ADDRESS     0xFFFFF000
  298 
  299 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
  300 static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
  301 static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
  302 static void vmx_inject_pir(struct vlapic *vlapic);
  303 
  304 static inline bool
  305 host_has_rdpid(void)
  306 {
  307         return ((cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0);
  308 }
  309 
  310 static inline bool
  311 host_has_rdtscp(void)
  312 {
  313         return ((amd_feature & AMDID_RDTSCP) != 0);
  314 }
  315 
  316 #ifdef KTR
  317 static const char *
  318 exit_reason_to_str(int reason)
  319 {
  320         static char reasonbuf[32];
  321 
  322         switch (reason) {
  323         case EXIT_REASON_EXCEPTION:
  324                 return "exception";
  325         case EXIT_REASON_EXT_INTR:
  326                 return "extint";
  327         case EXIT_REASON_TRIPLE_FAULT:
  328                 return "triplefault";
  329         case EXIT_REASON_INIT:
  330                 return "init";
  331         case EXIT_REASON_SIPI:
  332                 return "sipi";
  333         case EXIT_REASON_IO_SMI:
  334                 return "iosmi";
  335         case EXIT_REASON_SMI:
  336                 return "smi";
  337         case EXIT_REASON_INTR_WINDOW:
  338                 return "intrwindow";
  339         case EXIT_REASON_NMI_WINDOW:
  340                 return "nmiwindow";
  341         case EXIT_REASON_TASK_SWITCH:
  342                 return "taskswitch";
  343         case EXIT_REASON_CPUID:
  344                 return "cpuid";
  345         case EXIT_REASON_GETSEC:
  346                 return "getsec";
  347         case EXIT_REASON_HLT:
  348                 return "hlt";
  349         case EXIT_REASON_INVD:
  350                 return "invd";
  351         case EXIT_REASON_INVLPG:
  352                 return "invlpg";
  353         case EXIT_REASON_RDPMC:
  354                 return "rdpmc";
  355         case EXIT_REASON_RDTSC:
  356                 return "rdtsc";
  357         case EXIT_REASON_RSM:
  358                 return "rsm";
  359         case EXIT_REASON_VMCALL:
  360                 return "vmcall";
  361         case EXIT_REASON_VMCLEAR:
  362                 return "vmclear";
  363         case EXIT_REASON_VMLAUNCH:
  364                 return "vmlaunch";
  365         case EXIT_REASON_VMPTRLD:
  366                 return "vmptrld";
  367         case EXIT_REASON_VMPTRST:
  368                 return "vmptrst";
  369         case EXIT_REASON_VMREAD:
  370                 return "vmread";
  371         case EXIT_REASON_VMRESUME:
  372                 return "vmresume";
  373         case EXIT_REASON_VMWRITE:
  374                 return "vmwrite";
  375         case EXIT_REASON_VMXOFF:
  376                 return "vmxoff";
  377         case EXIT_REASON_VMXON:
  378                 return "vmxon";
  379         case EXIT_REASON_CR_ACCESS:
  380                 return "craccess";
  381         case EXIT_REASON_DR_ACCESS:
  382                 return "draccess";
  383         case EXIT_REASON_INOUT:
  384                 return "inout";
  385         case EXIT_REASON_RDMSR:
  386                 return "rdmsr";
  387         case EXIT_REASON_WRMSR:
  388                 return "wrmsr";
  389         case EXIT_REASON_INVAL_VMCS:
  390                 return "invalvmcs";
  391         case EXIT_REASON_INVAL_MSR:
  392                 return "invalmsr";
  393         case EXIT_REASON_MWAIT:
  394                 return "mwait";
  395         case EXIT_REASON_MTF:
  396                 return "mtf";
  397         case EXIT_REASON_MONITOR:
  398                 return "monitor";
  399         case EXIT_REASON_PAUSE:
  400                 return "pause";
  401         case EXIT_REASON_MCE_DURING_ENTRY:
  402                 return "mce-during-entry";
  403         case EXIT_REASON_TPR:
  404                 return "tpr";
  405         case EXIT_REASON_APIC_ACCESS:
  406                 return "apic-access";
  407         case EXIT_REASON_GDTR_IDTR:
  408                 return "gdtridtr";
  409         case EXIT_REASON_LDTR_TR:
  410                 return "ldtrtr";
  411         case EXIT_REASON_EPT_FAULT:
  412                 return "eptfault";
  413         case EXIT_REASON_EPT_MISCONFIG:
  414                 return "eptmisconfig";
  415         case EXIT_REASON_INVEPT:
  416                 return "invept";
  417         case EXIT_REASON_RDTSCP:
  418                 return "rdtscp";
  419         case EXIT_REASON_VMX_PREEMPT:
  420                 return "vmxpreempt";
  421         case EXIT_REASON_INVVPID:
  422                 return "invvpid";
  423         case EXIT_REASON_WBINVD:
  424                 return "wbinvd";
  425         case EXIT_REASON_XSETBV:
  426                 return "xsetbv";
  427         case EXIT_REASON_APIC_WRITE:
  428                 return "apic-write";
  429         default:
  430                 snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
  431                 return (reasonbuf);
  432         }
  433 }
  434 #endif  /* KTR */
  435 
  436 static int
  437 vmx_allow_x2apic_msrs(struct vmx *vmx)
  438 {
  439         int i, error;
  440 
  441         error = 0;
  442 
  443         /*
  444          * Allow readonly access to the following x2APIC MSRs from the guest.
  445          */
  446         error += guest_msr_ro(vmx, MSR_APIC_ID);
  447         error += guest_msr_ro(vmx, MSR_APIC_VERSION);
  448         error += guest_msr_ro(vmx, MSR_APIC_LDR);
  449         error += guest_msr_ro(vmx, MSR_APIC_SVR);
  450 
  451         for (i = 0; i < 8; i++)
  452                 error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
  453 
  454         for (i = 0; i < 8; i++)
  455                 error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
  456 
  457         for (i = 0; i < 8; i++)
  458                 error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
  459 
  460         error += guest_msr_ro(vmx, MSR_APIC_ESR);
  461         error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
  462         error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
  463         error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
  464         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
  465         error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
  466         error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
  467         error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
  468         error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
  469         error += guest_msr_ro(vmx, MSR_APIC_ICR);
  470 
  471         /*
  472          * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
  473          *
  474          * These registers get special treatment described in the section
  475          * "Virtualizing MSR-Based APIC Accesses".
  476          */
  477         error += guest_msr_rw(vmx, MSR_APIC_TPR);
  478         error += guest_msr_rw(vmx, MSR_APIC_EOI);
  479         error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
  480 
  481         return (error);
  482 }
  483 
  484 u_long
  485 vmx_fix_cr0(u_long cr0)
  486 {
  487 
  488         return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
  489 }
  490 
  491 u_long
  492 vmx_fix_cr4(u_long cr4)
  493 {
  494 
  495         return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
  496 }
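
/*
 * The masks applied above are derived in vmx_init() from the
 * MSR_VMX_CR0_FIXED{0,1} and MSR_VMX_CR4_FIXED{0,1} MSRs: a bit that is 1
 * in both FIXED0 and FIXED1 must be 1 while in VMX operation, and a bit
 * that is 0 in both must be 0.  As an illustration (exact values are
 * processor dependent), a CPU reporting CR0_FIXED0 = 0x80000021 and
 * CR0_FIXED1 = 0xffffffff yields cr0_ones_mask = 0x80000021, forcing PG,
 * NE and PE on, so vmx_fix_cr0(0) returns 0x80000021.
 */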
  497 
  498 static void
  499 vpid_free(int vpid)
  500 {
  501         if (vpid < 0 || vpid > 0xffff)
  502                 panic("vpid_free: invalid vpid %d", vpid);
  503 
  504         /*
  505          * VPIDs [0,VM_MAXCPU] are special and are not allocated from
  506          * the unit number allocator.
  507          */
  508 
  509         if (vpid > VM_MAXCPU)
  510                 free_unr(vpid_unr, vpid);
  511 }
  512 
  513 static void
  514 vpid_alloc(uint16_t *vpid, int num)
  515 {
  516         int i, x;
  517 
  518         if (num <= 0 || num > VM_MAXCPU)
  519                 panic("invalid number of vpids requested: %d", num);
  520 
  521         /*
  522          * If the "enable vpid" execution control is not enabled then the
  523          * VPID is required to be 0 for all vcpus.
  524          */
  525         if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
  526                 for (i = 0; i < num; i++)
  527                         vpid[i] = 0;
  528                 return;
  529         }
  530 
  531         /*
  532          * Allocate a unique VPID for each vcpu from the unit number allocator.
  533          */
  534         for (i = 0; i < num; i++) {
  535                 x = alloc_unr(vpid_unr);
  536                 if (x == -1)
  537                         break;
  538                 else
  539                         vpid[i] = x;
  540         }
  541 
  542         if (i < num) {
  543                 atomic_add_int(&vpid_alloc_failed, 1);
  544 
  545                 /*
  546                  * If the unit number allocator does not have enough unique
  547                  * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
  548                  *
   549          * These VPIDs are not unique across VMs but this does not
  550                  * affect correctness because the combined mappings are also
  551                  * tagged with the EP4TA which is unique for each VM.
  552                  *
  553                  * It is still sub-optimal because the invvpid will invalidate
  554                  * combined mappings for a particular VPID across all EP4TAs.
  555                  */
  556                 while (i-- > 0)
  557                         vpid_free(vpid[i]);
  558 
  559                 for (i = 0; i < num; i++)
  560                         vpid[i] = i + 1;
  561         }
  562 }
  563 
  564 static void
  565 vpid_init(void)
  566 {
  567         /*
  568          * VPID 0 is required when the "enable VPID" execution control is
  569          * disabled.
  570          *
  571          * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
  572          * unit number allocator does not have sufficient unique VPIDs to
  573          * satisfy the allocation.
  574          *
  575          * The remaining VPIDs are managed by the unit number allocator.
  576          */
  577         vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
  578 }
  579 
  580 static void
  581 vmx_disable(void *arg __unused)
  582 {
  583         struct invvpid_desc invvpid_desc = { 0 };
  584         struct invept_desc invept_desc = { 0 };
  585 
  586         if (vmxon_enabled[curcpu]) {
  587                 /*
  588                  * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
  589                  *
   590                  * VMXON and VMXOFF are not required to invalidate any TLB
   591                  * caching structures, so flush them explicitly here to avoid
   592                  * retaining cached information across distinct VMX episodes.
  593                  */
  594                 invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
  595                 invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
  596                 vmxoff();
  597         }
  598         load_cr4(rcr4() & ~CR4_VMXE);
  599 }
  600 
  601 static int
  602 vmx_cleanup(void)
  603 {
  604 
  605         if (pirvec >= 0)
  606                 lapic_ipi_free(pirvec);
  607 
  608         if (vpid_unr != NULL) {
  609                 delete_unrhdr(vpid_unr);
  610                 vpid_unr = NULL;
  611         }
  612 
  613         if (nmi_flush_l1d_sw == 1)
  614                 nmi_flush_l1d_sw = 0;
  615 
  616         smp_rendezvous(NULL, vmx_disable, NULL, NULL);
  617 
  618         return (0);
  619 }
  620 
  621 static void
  622 vmx_enable(void *arg __unused)
  623 {
  624         int error;
  625         uint64_t feature_control;
  626 
  627         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
  628         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
  629             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
  630                 wrmsr(MSR_IA32_FEATURE_CONTROL,
  631                     feature_control | IA32_FEATURE_CONTROL_VMX_EN |
  632                     IA32_FEATURE_CONTROL_LOCK);
  633         }
  634 
  635         load_cr4(rcr4() | CR4_VMXE);
  636 
  637         *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
  638         error = vmxon(vmxon_region[curcpu]);
  639         if (error == 0)
  640                 vmxon_enabled[curcpu] = 1;
  641 }
  642 
  643 static void
  644 vmx_restore(void)
  645 {
  646 
  647         if (vmxon_enabled[curcpu])
  648                 vmxon(vmxon_region[curcpu]);
  649 }
  650 
  651 static int
  652 vmx_init(int ipinum)
  653 {
  654         int error;
  655         uint64_t basic, fixed0, fixed1, feature_control;
  656         uint32_t tmp, procbased2_vid_bits;
  657 
  658         /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
  659         if (!(cpu_feature2 & CPUID2_VMX)) {
  660                 printf("vmx_init: processor does not support VMX operation\n");
  661                 return (ENXIO);
  662         }
  663 
  664         /*
  665          * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
  666          * are set (bits 0 and 2 respectively).
  667          */
  668         feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
  669         if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
  670             (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
  671                 printf("vmx_init: VMX operation disabled by BIOS\n");
  672                 return (ENXIO);
  673         }
  674 
  675         /*
  676          * Verify capabilities MSR_VMX_BASIC:
  677          * - bit 54 indicates support for INS/OUTS decoding
  678          */
  679         basic = rdmsr(MSR_VMX_BASIC);
  680         if ((basic & (1UL << 54)) == 0) {
  681                 printf("vmx_init: processor does not support desired basic "
  682                     "capabilities\n");
  683                 return (EINVAL);
  684         }
  685 
  686         /* Check support for primary processor-based VM-execution controls */
  687         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  688                                MSR_VMX_TRUE_PROCBASED_CTLS,
  689                                PROCBASED_CTLS_ONE_SETTING,
  690                                PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
  691         if (error) {
  692                 printf("vmx_init: processor does not support desired primary "
  693                        "processor-based controls\n");
  694                 return (error);
  695         }
  696 
  697         /* Clear the processor-based ctl bits that are set on demand */
  698         procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
  699 
  700         /* Check support for secondary processor-based VM-execution controls */
  701         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
  702                                MSR_VMX_PROCBASED_CTLS2,
  703                                PROCBASED_CTLS2_ONE_SETTING,
  704                                PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
  705         if (error) {
  706                 printf("vmx_init: processor does not support desired secondary "
  707                        "processor-based controls\n");
  708                 return (error);
  709         }
  710 
  711         /* Check support for VPID */
  712         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
  713                                PROCBASED2_ENABLE_VPID, 0, &tmp);
  714         if (error == 0)
  715                 procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
  716 
  717         /* Check support for pin-based VM-execution controls */
  718         error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
  719                                MSR_VMX_TRUE_PINBASED_CTLS,
  720                                PINBASED_CTLS_ONE_SETTING,
  721                                PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
  722         if (error) {
  723                 printf("vmx_init: processor does not support desired "
  724                        "pin-based controls\n");
  725                 return (error);
  726         }
  727 
  728         /* Check support for VM-exit controls */
  729         error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
  730                                VM_EXIT_CTLS_ONE_SETTING,
  731                                VM_EXIT_CTLS_ZERO_SETTING,
  732                                &exit_ctls);
  733         if (error) {
  734                 printf("vmx_init: processor does not support desired "
  735                     "exit controls\n");
  736                 return (error);
  737         }
  738 
  739         /* Check support for VM-entry controls */
  740         error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
  741             VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
  742             &entry_ctls);
  743         if (error) {
  744                 printf("vmx_init: processor does not support desired "
  745                     "entry controls\n");
  746                 return (error);
  747         }
  748 
  749         /*
  750          * Check support for optional features by testing them
  751          * as individual bits
  752          */
  753         cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  754                                         MSR_VMX_TRUE_PROCBASED_CTLS,
  755                                         PROCBASED_HLT_EXITING, 0,
  756                                         &tmp) == 0);
  757 
  758         cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  759                                         MSR_VMX_PROCBASED_CTLS,
  760                                         PROCBASED_MTF, 0,
  761                                         &tmp) == 0);
  762 
  763         cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  764                                          MSR_VMX_TRUE_PROCBASED_CTLS,
  765                                          PROCBASED_PAUSE_EXITING, 0,
  766                                          &tmp) == 0);
  767 
  768         /*
  769          * Check support for RDPID and/or RDTSCP.
  770          *
  771          * Support a pass-through-based implementation of these via the
  772          * "enable RDTSCP" VM-execution control and the "RDTSC exiting"
  773          * VM-execution control.
  774          *
  775          * The "enable RDTSCP" VM-execution control applies to both RDPID
  776          * and RDTSCP (see SDM volume 3, section 25.3, "Changes to
  777          * Instruction Behavior in VMX Non-root operation"); this is why
  778          * only this VM-execution control needs to be enabled in order to
  779          * enable passing through whichever of RDPID and/or RDTSCP are
  780          * supported by the host.
  781          *
  782          * The "RDTSC exiting" VM-execution control applies to both RDTSC
  783          * and RDTSCP (again, per SDM volume 3, section 25.3), and is
  784          * already set up for RDTSC and RDTSCP pass-through by the current
  785          * implementation of RDTSC.
  786          *
  787          * Although RDPID and RDTSCP are optional capabilities, since there
  788          * does not currently seem to be a use case for enabling/disabling
  789          * these via libvmmapi, choose not to support this and, instead,
  790          * just statically always enable or always disable this support
  791          * across all vCPUs on all VMs. (Note that there may be some
  792          * complications to providing this functionality, e.g., the MSR
  793          * bitmap is currently per-VM rather than per-vCPU while the
  794          * capability API wants to be able to control capabilities on a
  795          * per-vCPU basis).
  796          */
  797         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
  798                                MSR_VMX_PROCBASED_CTLS2,
  799                                PROCBASED2_ENABLE_RDTSCP, 0, &tmp);
  800         cap_rdpid = error == 0 && host_has_rdpid();
  801         cap_rdtscp = error == 0 && host_has_rdtscp();
  802         if (cap_rdpid || cap_rdtscp)
  803                 procbased_ctls2 |= PROCBASED2_ENABLE_RDTSCP;
  804 
  805         cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
  806                                         MSR_VMX_PROCBASED_CTLS2,
  807                                         PROCBASED2_UNRESTRICTED_GUEST, 0,
  808                                         &tmp) == 0);
  809 
  810         cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
  811             MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0,
  812             &tmp) == 0);
  813 
  814         /*
  815          * Check support for TPR shadow.
  816          */
  817         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
  818             MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0,
  819             &tmp);
  820         if (error == 0) {
  821                 tpr_shadowing = 1;
  822                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_tpr_shadowing",
  823                     &tpr_shadowing);
  824         }
  825 
  826         if (tpr_shadowing) {
  827                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
  828                 procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING;
  829                 procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING;
  830         }
  831 
  832         /*
  833          * Check support for virtual interrupt delivery.
  834          */
  835         procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES |
  836             PROCBASED2_VIRTUALIZE_X2APIC_MODE |
  837             PROCBASED2_APIC_REGISTER_VIRTUALIZATION |
  838             PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY);
  839 
  840         error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
  841             procbased2_vid_bits, 0, &tmp);
  842         if (error == 0 && tpr_shadowing) {
  843                 virtual_interrupt_delivery = 1;
  844                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid",
  845                     &virtual_interrupt_delivery);
  846         }
  847 
  848         if (virtual_interrupt_delivery) {
  849                 procbased_ctls |= PROCBASED_USE_TPR_SHADOW;
  850                 procbased_ctls2 |= procbased2_vid_bits;
  851                 procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE;
  852 
  853                 /*
  854                  * Check for Posted Interrupts only if Virtual Interrupt
  855                  * Delivery is enabled.
  856                  */
  857                 error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
  858                     MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0,
  859                     &tmp);
  860                 if (error == 0) {
  861                         pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
  862                             &IDTVEC(justreturn));
  863                         if (pirvec < 0) {
  864                                 if (bootverbose) {
  865                                         printf("vmx_init: unable to allocate "
  866                                             "posted interrupt vector\n");
  867                                 }
  868                         } else {
  869                                 posted_interrupts = 1;
  870                                 TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir",
  871                                     &posted_interrupts);
  872                         }
  873                 }
  874         }
  875 
  876         if (posted_interrupts)
   877                 pinbased_ctls |= PINBASED_POSTED_INTERRUPT;
  878 
  879         /* Initialize EPT */
  880         error = ept_init(ipinum);
  881         if (error) {
  882                 printf("vmx_init: ept initialization failed (%d)\n", error);
  883                 return (error);
  884         }
  885 
  886         guest_l1d_flush = (cpu_ia32_arch_caps &
  887             IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0;
  888         TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
  889 
  890         /*
   891          * If the L1D cache flush is enabled, use the IA32_FLUSH_CMD MSR when
  892          * available.  Otherwise fall back to the software flush
  893          * method which loads enough data from the kernel text to
  894          * flush existing L1D content, both on VMX entry and on NMI
  895          * return.
  896          */
  897         if (guest_l1d_flush) {
  898                 if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
  899                         guest_l1d_flush_sw = 1;
  900                         TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
  901                             &guest_l1d_flush_sw);
  902                 }
  903                 if (guest_l1d_flush_sw) {
  904                         if (nmi_flush_l1d_sw <= 1)
  905                                 nmi_flush_l1d_sw = 1;
  906                 } else {
  907                         msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
  908                         msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
  909                 }
  910         }
  911 
  912         /*
  913          * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
  914          */
  915         fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
  916         fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
  917         cr0_ones_mask = fixed0 & fixed1;
  918         cr0_zeros_mask = ~fixed0 & ~fixed1;
  919 
  920         /*
  921          * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
  922          * if unrestricted guest execution is allowed.
  923          */
  924         if (cap_unrestricted_guest)
  925                 cr0_ones_mask &= ~(CR0_PG | CR0_PE);
  926 
  927         /*
  928          * Do not allow the guest to set CR0_NW or CR0_CD.
  929          */
  930         cr0_zeros_mask |= (CR0_NW | CR0_CD);
  931 
  932         fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
  933         fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
  934         cr4_ones_mask = fixed0 & fixed1;
  935         cr4_zeros_mask = ~fixed0 & ~fixed1;
  936 
  937         vpid_init();
  938 
  939         vmx_msr_init();
  940 
  941         /* enable VMX operation */
  942         smp_rendezvous(NULL, vmx_enable, NULL, NULL);
  943 
  944         vmx_initialized = 1;
  945 
  946         return (0);
  947 }
  948 
  949 static void
  950 vmx_trigger_hostintr(int vector)
  951 {
  952         uintptr_t func;
  953         struct gate_descriptor *gd;
  954 
  955         gd = &idt[vector];
  956 
  957         KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: "
  958             "invalid vector %d", vector));
  959         KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present",
  960             vector));
  961         KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d "
  962             "has invalid type %d", vector, gd->gd_type));
  963         KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d "
  964             "has invalid dpl %d", vector, gd->gd_dpl));
  965         KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor "
  966             "for vector %d has invalid selector %d", vector, gd->gd_selector));
  967         KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid "
  968             "IST %d", vector, gd->gd_ist));
  969 
  970         func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset);
  971         vmx_call_isr(func);
  972 }
  973 
  974 static int
  975 vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
  976 {
  977         int error, mask_ident, shadow_ident;
  978         uint64_t mask_value;
  979 
  980         if (which != 0 && which != 4)
  981                 panic("vmx_setup_cr_shadow: unknown cr%d", which);
  982 
  983         if (which == 0) {
  984                 mask_ident = VMCS_CR0_MASK;
  985                 mask_value = cr0_ones_mask | cr0_zeros_mask;
  986                 shadow_ident = VMCS_CR0_SHADOW;
  987         } else {
  988                 mask_ident = VMCS_CR4_MASK;
  989                 mask_value = cr4_ones_mask | cr4_zeros_mask;
  990                 shadow_ident = VMCS_CR4_SHADOW;
  991         }
  992 
  993         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
  994         if (error)
  995                 return (error);
  996 
  997         error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
  998         if (error)
  999                 return (error);
 1000 
 1001         return (0);
 1002 }
 1003 #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
 1004 #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
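
/*
 * The CR0/CR4 guest/host masks written by vmx_setup_cr_shadow() are
 * (cr{0,4}_ones_mask | cr{0,4}_zeros_mask), i.e. exactly the bits that VMX
 * forces to fixed values.  A guest read of a masked bit returns the value
 * from the read shadow rather than the real control register, and a guest
 * write that sets a masked bit to a value differing from the read shadow
 * causes a CR-access VM-exit.  vmx_vminit() below initializes the shadows
 * to the architectural power-on values (CR0 = 0x60000010, CR4 = 0).
 */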
 1005 
 1006 static void *
 1007 vmx_vminit(struct vm *vm, pmap_t pmap)
 1008 {
 1009         uint16_t vpid[VM_MAXCPU];
 1010         int i, error;
 1011         struct vmx *vmx;
 1012         struct vmcs *vmcs;
 1013         uint32_t exc_bitmap;
 1014         uint16_t maxcpus;
 1015 
 1016         vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
 1017         if ((uintptr_t)vmx & PAGE_MASK) {
 1018                 panic("malloc of struct vmx not aligned on %d byte boundary",
 1019                       PAGE_SIZE);
 1020         }
 1021         vmx->vm = vm;
 1022 
 1023         vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
 1024 
 1025         /*
 1026          * Clean up EPTP-tagged guest physical and combined mappings
 1027          *
 1028          * VMX transitions are not required to invalidate any guest physical
 1029          * mappings. So, it may be possible for stale guest physical mappings
 1030          * to be present in the processor TLBs.
 1031          *
 1032          * Combined mappings for this EP4TA are also invalidated for all VPIDs.
 1033          */
 1034         ept_invalidate_mappings(vmx->eptp);
 1035 
 1036         msr_bitmap_initialize(vmx->msr_bitmap);
 1037 
 1038         /*
 1039          * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
 1040          * The guest FSBASE and GSBASE are saved and restored during
 1041          * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
 1042          * always restored from the vmcs host state area on vm-exit.
 1043          *
 1044          * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
 1045          * how they are saved/restored so can be directly accessed by the
 1046          * guest.
 1047          *
 1048          * MSR_EFER is saved and restored in the guest VMCS area on a
 1049          * VM exit and entry respectively. It is also restored from the
 1050          * host VMCS area on a VM exit.
 1051          *
 1052          * The TSC MSR is exposed read-only. Writes are disallowed as
  1053          * that will impact the host TSC.  If the guest does a write,
 1054          * the "use TSC offsetting" execution control is enabled and the
 1055          * difference between the host TSC and the guest TSC is written
 1056          * into the TSC offset in the VMCS.
 1057          *
 1058          * Guest TSC_AUX support is enabled if any of guest RDPID and/or
 1059          * guest RDTSCP support are enabled (since, as per Table 2-2 in SDM
 1060          * volume 4, TSC_AUX is supported if any of RDPID and/or RDTSCP are
 1061          * supported). If guest TSC_AUX support is enabled, TSC_AUX is
 1062          * exposed read-only so that the VMM can do one fewer MSR read per
 1063          * exit than if this register were exposed read-write; the guest
 1064          * restore value can be updated during guest writes (expected to be
 1065          * rare) instead of during all exits (common).
 1066          */
 1067         if (guest_msr_rw(vmx, MSR_GSBASE) ||
 1068             guest_msr_rw(vmx, MSR_FSBASE) ||
 1069             guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
 1070             guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
 1071             guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
 1072             guest_msr_rw(vmx, MSR_EFER) ||
 1073             guest_msr_ro(vmx, MSR_TSC) ||
 1074             ((cap_rdpid || cap_rdtscp) && guest_msr_ro(vmx, MSR_TSC_AUX)))
 1075                 panic("vmx_vminit: error setting guest msr access");
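
        /*
         * A note on the pass-through mechanism: guest_msr_rw() and
         * guest_msr_ro() manipulate the 4KB per-VM MSR bitmap referenced
         * from each VMCS.  A clear bit lets the corresponding RDMSR/WRMSR
         * execute directly in the guest, while a set bit forces a VM-exit.
         * Read-only pass-through therefore clears the read bit but leaves
         * the write bit set, which is how MSR_TSC and (when enabled)
         * MSR_TSC_AUX are treated above.
         */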
 1076 
 1077         vpid_alloc(vpid, VM_MAXCPU);
 1078 
 1079         if (virtual_interrupt_delivery) {
 1080                 error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
 1081                     APIC_ACCESS_ADDRESS);
 1082                 /* XXX this should really return an error to the caller */
 1083                 KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error));
 1084         }
 1085 
 1086         maxcpus = vm_get_maxcpus(vm);
 1087         for (i = 0; i < maxcpus; i++) {
 1088                 vmcs = &vmx->vmcs[i];
 1089                 vmcs->identifier = vmx_revision();
 1090                 error = vmclear(vmcs);
 1091                 if (error != 0) {
 1092                         panic("vmx_vminit: vmclear error %d on vcpu %d\n",
 1093                               error, i);
 1094                 }
 1095 
 1096                 vmx_msr_guest_init(vmx, i);
 1097 
 1098                 error = vmcs_init(vmcs);
 1099                 KASSERT(error == 0, ("vmcs_init error %d", error));
 1100 
 1101                 VMPTRLD(vmcs);
 1102                 error = 0;
 1103                 error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]);
 1104                 error += vmwrite(VMCS_EPTP, vmx->eptp);
 1105                 error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
 1106                 error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls);
 1107                 error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2);
 1108                 error += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
 1109                 error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
 1110                 error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
 1111                 error += vmwrite(VMCS_VPID, vpid[i]);
 1112 
 1113                 if (guest_l1d_flush && !guest_l1d_flush_sw) {
 1114                         vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract(
 1115                             (vm_offset_t)&msr_load_list[0]));
 1116                         vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
 1117                             nitems(msr_load_list));
 1118                         vmcs_write(VMCS_EXIT_MSR_STORE, 0);
 1119                         vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0);
 1120                 }
 1121 
 1122                 /* exception bitmap */
 1123                 if (vcpu_trace_exceptions(vm, i))
 1124                         exc_bitmap = 0xffffffff;
 1125                 else
 1126                         exc_bitmap = 1 << IDT_MC;
 1127                 error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap);
 1128 
 1129                 vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1;
 1130                 error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1);
 1131 
 1132                 if (tpr_shadowing) {
 1133                         error += vmwrite(VMCS_VIRTUAL_APIC,
 1134                             vtophys(&vmx->apic_page[i]));
 1135                 }
 1136 
 1137                 if (virtual_interrupt_delivery) {
 1138                         error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS);
 1139                         error += vmwrite(VMCS_EOI_EXIT0, 0);
 1140                         error += vmwrite(VMCS_EOI_EXIT1, 0);
 1141                         error += vmwrite(VMCS_EOI_EXIT2, 0);
 1142                         error += vmwrite(VMCS_EOI_EXIT3, 0);
 1143                 }
 1144                 if (posted_interrupts) {
 1145                         error += vmwrite(VMCS_PIR_VECTOR, pirvec);
 1146                         error += vmwrite(VMCS_PIR_DESC,
 1147                             vtophys(&vmx->pir_desc[i]));
 1148                 }
 1149                 VMCLEAR(vmcs);
 1150                 KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs"));
 1151 
 1152                 vmx->cap[i].set = 0;
 1153                 vmx->cap[i].set |= cap_rdpid != 0 ? 1 << VM_CAP_RDPID : 0;
 1154                 vmx->cap[i].set |= cap_rdtscp != 0 ? 1 << VM_CAP_RDTSCP : 0;
 1155                 vmx->cap[i].proc_ctls = procbased_ctls;
 1156                 vmx->cap[i].proc_ctls2 = procbased_ctls2;
 1157                 vmx->cap[i].exc_bitmap = exc_bitmap;
 1158 
 1159                 vmx->state[i].nextrip = ~0;
 1160                 vmx->state[i].lastcpu = NOCPU;
 1161                 vmx->state[i].vpid = vpid[i];
 1162 
 1163                 /*
 1164                  * Set up the CR0/4 shadows, and init the read shadow
 1165                  * to the power-on register value from the Intel Sys Arch.
 1166                  *  CR0 - 0x60000010
 1167                  *  CR4 - 0
 1168                  */
 1169                 error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
 1170                 if (error != 0)
 1171                         panic("vmx_setup_cr0_shadow %d", error);
 1172 
 1173                 error = vmx_setup_cr4_shadow(vmcs, 0);
 1174                 if (error != 0)
 1175                         panic("vmx_setup_cr4_shadow %d", error);
 1176 
 1177                 vmx->ctx[i].pmap = pmap;
 1178         }
 1179 
 1180         return (vmx);
 1181 }
 1182 
 1183 static int
 1184 vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
 1185 {
 1186         int handled, func;
 1187 
 1188         func = vmxctx->guest_rax;
 1189 
 1190         handled = x86_emulate_cpuid(vm, vcpu,
 1191                                     (uint32_t*)(&vmxctx->guest_rax),
 1192                                     (uint32_t*)(&vmxctx->guest_rbx),
 1193                                     (uint32_t*)(&vmxctx->guest_rcx),
 1194                                     (uint32_t*)(&vmxctx->guest_rdx));
 1195         return (handled);
 1196 }
 1197 
 1198 static __inline void
 1199 vmx_run_trace(struct vmx *vmx, int vcpu)
 1200 {
 1201 #ifdef KTR
 1202         VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
 1203 #endif
 1204 }
 1205 
 1206 static __inline void
 1207 vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
 1208                int handled)
 1209 {
 1210 #ifdef KTR
 1211         VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
 1212                  handled ? "handled" : "unhandled",
 1213                  exit_reason_to_str(exit_reason), rip);
 1214 #endif
 1215 }
 1216 
 1217 static __inline void
 1218 vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
 1219 {
 1220 #ifdef KTR
 1221         VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
 1222 #endif
 1223 }
 1224 
 1225 static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved");
 1226 static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done");
 1227 
 1228 /*
 1229  * Invalidate guest mappings identified by its vpid from the TLB.
 1230  */
 1231 static __inline void
 1232 vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running)
 1233 {
 1234         struct vmxstate *vmxstate;
 1235         struct invvpid_desc invvpid_desc;
 1236 
 1237         vmxstate = &vmx->state[vcpu];
 1238         if (vmxstate->vpid == 0)
 1239                 return;
 1240 
 1241         if (!running) {
 1242                 /*
 1243                  * Set the 'lastcpu' to an invalid host cpu.
 1244                  *
 1245                  * This will invalidate TLB entries tagged with the vcpu's
 1246                  * vpid the next time it runs via vmx_set_pcpu_defaults().
 1247                  */
 1248                 vmxstate->lastcpu = NOCPU;
 1249                 return;
 1250         }
 1251 
 1252         KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside "
 1253             "critical section", __func__, vcpu));
 1254 
 1255         /*
 1256          * Invalidate all mappings tagged with 'vpid'
 1257          *
 1258          * We do this because this vcpu was executing on a different host
 1259          * cpu when it last ran. We do not track whether it invalidated
 1260          * mappings associated with its 'vpid' during that run. So we must
 1261          * assume that the mappings associated with 'vpid' on 'curcpu' are
 1262          * stale and invalidate them.
 1263          *
 1264          * Note that we incur this penalty only when the scheduler chooses to
 1265          * move the thread associated with this vcpu between host cpus.
 1266          *
 1267          * Note also that this will invalidate mappings tagged with 'vpid'
 1268          * for "all" EP4TAs.
 1269          */
 1270         if (pmap->pm_eptgen == vmx->eptgen[curcpu]) {
 1271                 invvpid_desc._res1 = 0;
 1272                 invvpid_desc._res2 = 0;
 1273                 invvpid_desc.vpid = vmxstate->vpid;
 1274                 invvpid_desc.linear_addr = 0;
 1275                 invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
 1276                 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1);
 1277         } else {
 1278                 /*
 1279                  * The invvpid can be skipped if an invept is going to
 1280                  * be performed before entering the guest. The invept
 1281                  * will invalidate combined mappings tagged with
 1282                  * 'vmx->eptp' for all vpids.
 1283                  */
 1284                 vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1);
 1285         }
 1286 }
 1287 
 1288 static void
 1289 vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap)
 1290 {
 1291         struct vmxstate *vmxstate;
 1292 
 1293         vmxstate = &vmx->state[vcpu];
 1294         if (vmxstate->lastcpu == curcpu)
 1295                 return;
 1296 
 1297         vmxstate->lastcpu = curcpu;
 1298 
 1299         vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
 1300 
 1301         vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
 1302         vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
 1303         vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
 1304         vmx_invvpid(vmx, vcpu, pmap, 1);
 1305 }
 1306 
 1307 /*
 1308  * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
 1309  */
 1310 CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
 1311 
 1312 static void __inline
 1313 vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
 1314 {
 1315 
 1316         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
 1317                 vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
 1318                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1319                 VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
 1320         }
 1321 }
 1322 
 1323 static void __inline
 1324 vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
 1325 {
 1326 
 1327         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
 1328             ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
 1329         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
 1330         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1331         VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
 1332 }
 1333 
 1334 static void __inline
 1335 vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
 1336 {
 1337 
 1338         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
 1339                 vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
 1340                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1341                 VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
 1342         }
 1343 }
 1344 
 1345 static void __inline
 1346 vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
 1347 {
 1348 
 1349         KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
 1350             ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
 1351         vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
 1352         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1353         VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
 1354 }
 1355 
 1356 int
 1357 vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
 1358 {
 1359         int error;
 1360 
 1361         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
 1362                 vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
 1363                 vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
 1364                 VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
 1365         }
 1366 
 1367         error = vmwrite(VMCS_TSC_OFFSET, offset);
 1368 
 1369         return (error);
 1370 }
 1371 
 1372 #define NMI_BLOCKING    (VMCS_INTERRUPTIBILITY_NMI_BLOCKING |           \
 1373                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
 1374 #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING |           \
 1375                          VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
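/*
 * NMI_BLOCKING collects the interruptibility-state bits that inhibit NMI
 * injection (virtual-NMI blocking and blocking by MOV SS), while
 * HWINTR_BLOCKING collects those that inhibit maskable interrupts
 * (blocking by STI and by MOV SS). They are consulted below before
 * injecting an NMI or an external interrupt on VM entry.
 */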
 1376 
 1377 static void
 1378 vmx_inject_nmi(struct vmx *vmx, int vcpu)
 1379 {
 1380         uint32_t gi, info;
 1381 
 1382         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1383         KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
 1384             "interruptibility-state %#x", gi));
 1385 
 1386         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 1387         KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
 1388             "VM-entry interruption information %#x", info));
 1389 
 1390         /*
 1391          * Inject the virtual NMI. The vector must be the NMI IDT entry
 1392          * or the VMCS entry check will fail.
 1393          */
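        /*
         * Layout of the VM-entry interruption-information field (Intel
         * SDM): bits 7:0 hold the vector, bits 10:8 the event type,
         * bit 11 requests error-code delivery and bit 31 marks the
         * field valid.
         */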
 1394         info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
 1395         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 1396 
 1397         VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
 1398 
 1399         /* Clear the request */
 1400         vm_nmi_clear(vmx->vm, vcpu);
 1401 }
 1402 
 1403 static void
 1404 vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
 1405     uint64_t guestrip)
 1406 {
 1407         int vector, need_nmi_exiting, extint_pending;
 1408         uint64_t rflags, entryinfo;
 1409         uint32_t gi, info;
 1410 
 1411         if (vmx->state[vcpu].nextrip != guestrip) {
 1412                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1413                 if (gi & HWINTR_BLOCKING) {
 1414                         VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking "
 1415                             "cleared due to rip change: %#lx/%#lx",
 1416                             vmx->state[vcpu].nextrip, guestrip);
 1417                         gi &= ~HWINTR_BLOCKING;
 1418                         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 1419                 }
 1420         }
 1421 
 1422         if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
 1423                 KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
 1424                     "intinfo is not valid: %#lx", __func__, entryinfo));
 1425 
 1426                 info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 1427                 KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
 1428                      "pending exception: %#lx/%#x", __func__, entryinfo, info));
 1429 
 1430                 info = entryinfo;
 1431                 vector = info & 0xff;
 1432                 if (vector == IDT_BP || vector == IDT_OF) {
 1433                         /*
 1434                          * VT-x requires #BP and #OF to be injected as software
 1435                          * exceptions.
 1436                          */
 1437                         info &= ~VMCS_INTR_T_MASK;
 1438                         info |= VMCS_INTR_T_SWEXCEPTION;
 1439                 }
 1440 
 1441                 if (info & VMCS_INTR_DEL_ERRCODE)
 1442                         vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
 1443 
 1444                 vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 1445         }
 1446 
 1447         if (vm_nmi_pending(vmx->vm, vcpu)) {
 1448                 /*
 1449                  * If there are no conditions blocking NMI injection then
 1450                  * inject it directly here; otherwise enable "NMI window
 1451                  * exiting" to inject it as soon as we can.
 1452                  *
 1453                  * We also check for STI_BLOCKING because some implementations
 1454                  * don't allow NMI injection while STI blocking is in effect.
 1455                  * On a processor without this restriction the guest will exit
 1456                  * immediately and the NMI will be injected by the
 1457                  * "NMI window exiting" handler.
 1458                  */
 1459                 need_nmi_exiting = 1;
 1460                 gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1461                 if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
 1462                         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 1463                         if ((info & VMCS_INTR_VALID) == 0) {
 1464                                 vmx_inject_nmi(vmx, vcpu);
 1465                                 need_nmi_exiting = 0;
 1466                         } else {
 1467                                 VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
 1468                                     "due to VM-entry intr info %#x", info);
 1469                         }
 1470                 } else {
 1471                         VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
 1472                             "Guest Interruptibility-state %#x", gi);
 1473                 }
 1474 
 1475                 if (need_nmi_exiting)
 1476                         vmx_set_nmi_window_exiting(vmx, vcpu);
 1477         }
 1478 
 1479         extint_pending = vm_extint_pending(vmx->vm, vcpu);
 1480 
 1481         if (!extint_pending && virtual_interrupt_delivery) {
 1482                 vmx_inject_pir(vlapic);
 1483                 return;
 1484         }
 1485 
 1486         /*
 1487          * If interrupt-window exiting is already in effect then don't bother
 1488          * checking for pending interrupts. This is just an optimization and
 1489          * not needed for correctness.
 1490          */
 1491         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
 1492                 VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
 1493                     "pending int_window_exiting");
 1494                 return;
 1495         }
 1496 
 1497         if (!extint_pending) {
 1498                 /* Ask the local apic for a vector to inject */
 1499                 if (!vlapic_pending_intr(vlapic, &vector))
 1500                         return;
 1501 
 1502                 /*
 1503                  * From the Intel SDM, Volume 3, Section "Maskable
 1504                  * Hardware Interrupts":
 1505                  * - maskable interrupt vectors [16,255] can be delivered
 1506                  *   through the local APIC.
 1507                  */
 1508                 KASSERT(vector >= 16 && vector <= 255,
 1509                     ("invalid vector %d from local APIC", vector));
 1510         } else {
 1511                 /* Ask the legacy pic for a vector to inject */
 1512                 vatpic_pending_intr(vmx->vm, &vector);
 1513 
 1514                 /*
 1515                  * From the Intel SDM, Volume 3, Section "Maskable
 1516                  * Hardware Interrupts":
 1517                  * - maskable interrupt vectors [0,255] can be delivered
 1518                  *   through the INTR pin.
 1519                  */
 1520                 KASSERT(vector >= 0 && vector <= 255,
 1521                     ("invalid vector %d from INTR", vector));
 1522         }
 1523 
 1524         /* Check RFLAGS.IF and the interruptibility state of the guest */
 1525         rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 1526         if ((rflags & PSL_I) == 0) {
 1527                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 1528                     "rflags %#lx", vector, rflags);
 1529                 goto cantinject;
 1530         }
 1531 
 1532         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1533         if (gi & HWINTR_BLOCKING) {
 1534                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 1535                     "Guest Interruptibility-state %#x", vector, gi);
 1536                 goto cantinject;
 1537         }
 1538 
 1539         info = vmcs_read(VMCS_ENTRY_INTR_INFO);
 1540         if (info & VMCS_INTR_VALID) {
 1541                 /*
 1542                  * This is expected and could happen for multiple reasons:
 1543                  * - A vectoring VM-entry was aborted due to astpending
 1544                  * - A VM-exit happened during event injection.
 1545                  * - An exception was injected above.
 1546                  * - An NMI was injected above or after "NMI window exiting"
 1547                  */
 1548                 VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
 1549                     "VM-entry intr info %#x", vector, info);
 1550                 goto cantinject;
 1551         }
 1552 
 1553         /* Inject the interrupt */
 1554         info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
 1555         info |= vector;
 1556         vmcs_write(VMCS_ENTRY_INTR_INFO, info);
 1557 
 1558         if (!extint_pending) {
 1559                 /* Update the Local APIC ISR */
 1560                 vlapic_intr_accepted(vlapic, vector);
 1561         } else {
 1562                 vm_extint_clear(vmx->vm, vcpu);
 1563                 vatpic_intr_accepted(vmx->vm, vector);
 1564 
 1565                 /*
 1566                  * After we accepted the current ExtINT the PIC may
 1567                  * have posted another one.  If that is the case, set
 1568                  * the Interrupt Window Exiting execution control so
 1569                  * we can inject that one too.
 1570                  *
 1571                  * Also, interrupt window exiting allows us to inject any
 1572                  * pending APIC vector that was preempted by the ExtINT
 1573                  * as soon as possible. This applies both for the software
 1574                  * emulated vlapic and the hardware assisted virtual APIC.
 1575                  */
 1576                 vmx_set_int_window_exiting(vmx, vcpu);
 1577         }
 1578 
 1579         VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
 1580 
 1581         return;
 1582 
 1583 cantinject:
 1584         /*
 1585          * Set the Interrupt Window Exiting execution control so we can inject
 1586          * the interrupt as soon as the blocking condition goes away.
 1587          */
 1588         vmx_set_int_window_exiting(vmx, vcpu);
 1589 }
 1590 
 1591 /*
 1592  * If the Virtual NMIs execution control is '1' then the logical processor
 1593  * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
 1594  * the VMCS. An IRET instruction in VMX non-root operation will remove any
 1595  * virtual-NMI blocking.
 1596  *
 1597  * This unblocking occurs even if the IRET causes a fault. In this case the
 1598  * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
 1599  */
 1600 static void
 1601 vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
 1602 {
 1603         uint32_t gi;
 1604 
 1605         VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
 1606         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1607         gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
 1608         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 1609 }
 1610 
 1611 static void
 1612 vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
 1613 {
 1614         uint32_t gi;
 1615 
 1616         VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
 1617         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1618         gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
 1619         vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
 1620 }
 1621 
 1622 static void
 1623 vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid)
 1624 {
 1625         uint32_t gi;
 1626 
 1627         gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
 1628         KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING,
 1629             ("NMI blocking is not in effect %#x", gi));
 1630 }
 1631 
 1632 static int
 1633 vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 1634 {
 1635         struct vmxctx *vmxctx;
 1636         uint64_t xcrval;
 1637         const struct xsave_limits *limits;
 1638 
 1639         vmxctx = &vmx->ctx[vcpu];
 1640         limits = vmm_get_xsave_limits();
 1641 
 1642         /*
 1643          * Note that the processor raises a #GP fault on its own if
 1644          * xsetbv is executed for CPL != 0, so we do not have to
 1645          * emulate that fault here.
 1646          */
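        /*
         * XSETBV takes the XCR index in %ecx and the new value in
         * %edx:%eax, which is why the checks below look at guest_rcx and
         * assemble 'xcrval' from guest_rdx and guest_rax.
         */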
 1647 
 1648         /* Only xcr0 is supported. */
 1649         if (vmxctx->guest_rcx != 0) {
 1650                 vm_inject_gp(vmx->vm, vcpu);
 1651                 return (HANDLED);
 1652         }
 1653 
 1654         /* We only handle xcr0 if both the host and guest have XSAVE enabled. */
 1655         if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) {
 1656                 vm_inject_ud(vmx->vm, vcpu);
 1657                 return (HANDLED);
 1658         }
 1659 
 1660         xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff);
 1661         if ((xcrval & ~limits->xcr0_allowed) != 0) {
 1662                 vm_inject_gp(vmx->vm, vcpu);
 1663                 return (HANDLED);
 1664         }
 1665 
 1666         if (!(xcrval & XFEATURE_ENABLED_X87)) {
 1667                 vm_inject_gp(vmx->vm, vcpu);
 1668                 return (HANDLED);
 1669         }
 1670 
 1671         /* AVX (YMM_Hi128) requires SSE. */
 1672         if (xcrval & XFEATURE_ENABLED_AVX &&
 1673             (xcrval & XFEATURE_AVX) != XFEATURE_AVX) {
 1674                 vm_inject_gp(vmx->vm, vcpu);
 1675                 return (HANDLED);
 1676         }
 1677 
 1678         /*
 1679          * AVX512 requires base AVX (YMM_Hi128) as well as OpMask,
 1680          * ZMM_Hi256, and Hi16_ZMM.
 1681          */
 1682         if (xcrval & XFEATURE_AVX512 &&
 1683             (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) !=
 1684             (XFEATURE_AVX512 | XFEATURE_AVX)) {
 1685                 vm_inject_gp(vmx->vm, vcpu);
 1686                 return (HANDLED);
 1687         }
 1688 
 1689         /*
 1690          * Intel MPX requires both bound register state flags to be
 1691          * set.
 1692          */
 1693         if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) !=
 1694             ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) {
 1695                 vm_inject_gp(vmx->vm, vcpu);
 1696                 return (HANDLED);
 1697         }
 1698 
 1699         /*
 1700          * This runs "inside" vmrun() with the guest's FPU state, so
 1701          * modifying xcr0 directly modifies the guest's xcr0, not the
 1702          * host's.
 1703          */
 1704         load_xcr(0, xcrval);
 1705         return (HANDLED);
 1706 }
 1707 
 1708 static uint64_t
 1709 vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
 1710 {
 1711         const struct vmxctx *vmxctx;
 1712 
 1713         vmxctx = &vmx->ctx[vcpu];
 1714 
 1715         switch (ident) {
 1716         case 0:
 1717                 return (vmxctx->guest_rax);
 1718         case 1:
 1719                 return (vmxctx->guest_rcx);
 1720         case 2:
 1721                 return (vmxctx->guest_rdx);
 1722         case 3:
 1723                 return (vmxctx->guest_rbx);
 1724         case 4:
 1725                 return (vmcs_read(VMCS_GUEST_RSP));
 1726         case 5:
 1727                 return (vmxctx->guest_rbp);
 1728         case 6:
 1729                 return (vmxctx->guest_rsi);
 1730         case 7:
 1731                 return (vmxctx->guest_rdi);
 1732         case 8:
 1733                 return (vmxctx->guest_r8);
 1734         case 9:
 1735                 return (vmxctx->guest_r9);
 1736         case 10:
 1737                 return (vmxctx->guest_r10);
 1738         case 11:
 1739                 return (vmxctx->guest_r11);
 1740         case 12:
 1741                 return (vmxctx->guest_r12);
 1742         case 13:
 1743                 return (vmxctx->guest_r13);
 1744         case 14:
 1745                 return (vmxctx->guest_r14);
 1746         case 15:
 1747                 return (vmxctx->guest_r15);
 1748         default:
 1749                 panic("invalid vmx register %d", ident);
 1750         }
 1751 }
 1752 
 1753 static void
 1754 vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
 1755 {
 1756         struct vmxctx *vmxctx;
 1757 
 1758         vmxctx = &vmx->ctx[vcpu];
 1759 
 1760         switch (ident) {
 1761         case 0:
 1762                 vmxctx->guest_rax = regval;
 1763                 break;
 1764         case 1:
 1765                 vmxctx->guest_rcx = regval;
 1766                 break;
 1767         case 2:
 1768                 vmxctx->guest_rdx = regval;
 1769                 break;
 1770         case 3:
 1771                 vmxctx->guest_rbx = regval;
 1772                 break;
 1773         case 4:
 1774                 vmcs_write(VMCS_GUEST_RSP, regval);
 1775                 break;
 1776         case 5:
 1777                 vmxctx->guest_rbp = regval;
 1778                 break;
 1779         case 6:
 1780                 vmxctx->guest_rsi = regval;
 1781                 break;
 1782         case 7:
 1783                 vmxctx->guest_rdi = regval;
 1784                 break;
 1785         case 8:
 1786                 vmxctx->guest_r8 = regval;
 1787                 break;
 1788         case 9:
 1789                 vmxctx->guest_r9 = regval;
 1790                 break;
 1791         case 10:
 1792                 vmxctx->guest_r10 = regval;
 1793                 break;
 1794         case 11:
 1795                 vmxctx->guest_r11 = regval;
 1796                 break;
 1797         case 12:
 1798                 vmxctx->guest_r12 = regval;
 1799                 break;
 1800         case 13:
 1801                 vmxctx->guest_r13 = regval;
 1802                 break;
 1803         case 14:
 1804                 vmxctx->guest_r14 = regval;
 1805                 break;
 1806         case 15:
 1807                 vmxctx->guest_r15 = regval;
 1808                 break;
 1809         default:
 1810                 panic("invalid vmx register %d", ident);
 1811         }
 1812 }
 1813 
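/*
 * For the control-register access handlers below: the exit qualification
 * encodes the register number in bits 3:0, the access type in bits 5:4
 * (0 = MOV to CR, 1 = MOV from CR) and the source/destination
 * general-purpose register in bits 11:8, which is the index passed to
 * vmx_get_guest_reg() and vmx_set_guest_reg().
 */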
 1814 static int
 1815 vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 1816 {
 1817         uint64_t crval, regval;
 1818 
 1819         /* We only handle mov to %cr0 at this time */
 1820         if ((exitqual & 0xf0) != 0x00)
 1821                 return (UNHANDLED);
 1822 
 1823         regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 1824 
 1825         vmcs_write(VMCS_CR0_SHADOW, regval);
 1826 
 1827         crval = regval | cr0_ones_mask;
 1828         crval &= ~cr0_zeros_mask;
 1829         vmcs_write(VMCS_GUEST_CR0, crval);
 1830 
 1831         if (regval & CR0_PG) {
 1832                 uint64_t efer, entry_ctls;
 1833 
 1834                 /*
 1835                  * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
 1836                  * the "IA-32e mode guest" bit in VM-entry control must be
 1837                  * equal.
 1838                  */
 1839                 efer = vmcs_read(VMCS_GUEST_IA32_EFER);
 1840                 if (efer & EFER_LME) {
 1841                         efer |= EFER_LMA;
 1842                         vmcs_write(VMCS_GUEST_IA32_EFER, efer);
 1843                         entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
 1844                         entry_ctls |= VM_ENTRY_GUEST_LMA;
 1845                         vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
 1846                 }
 1847         }
 1848 
 1849         return (HANDLED);
 1850 }
 1851 
 1852 static int
 1853 vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 1854 {
 1855         uint64_t crval, regval;
 1856 
 1857         /* We only handle mov to %cr4 at this time */
 1858         if ((exitqual & 0xf0) != 0x00)
 1859                 return (UNHANDLED);
 1860 
 1861         regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
 1862 
 1863         vmcs_write(VMCS_CR4_SHADOW, regval);
 1864 
 1865         crval = regval | cr4_ones_mask;
 1866         crval &= ~cr4_zeros_mask;
 1867         vmcs_write(VMCS_GUEST_CR4, crval);
 1868 
 1869         return (HANDLED);
 1870 }
 1871 
 1872 static int
 1873 vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
 1874 {
 1875         struct vlapic *vlapic;
 1876         uint64_t cr8;
 1877         int regnum;
 1878 
 1879         /* We only handle mov %cr8 to/from a register at this time. */
 1880         if ((exitqual & 0xe0) != 0x00) {
 1881                 return (UNHANDLED);
 1882         }
 1883 
 1884         vlapic = vm_lapic(vmx->vm, vcpu);
 1885         regnum = (exitqual >> 8) & 0xf;
 1886         if (exitqual & 0x10) {
 1887                 cr8 = vlapic_get_cr8(vlapic);
 1888                 vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
 1889         } else {
 1890                 cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
 1891                 vlapic_set_cr8(vlapic, cr8);
 1892         }
 1893 
 1894         return (HANDLED);
 1895 }
 1896 
 1897 /*
 1898  * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
 1899  */
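/*
 * In the VMCS access-rights format the DPL occupies bits 6:5, hence the
 * shift and mask below.
 */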
 1900 static int
 1901 vmx_cpl(void)
 1902 {
 1903         uint32_t ssar;
 1904 
 1905         ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
 1906         return ((ssar >> 5) & 0x3);
 1907 }
 1908 
 1909 static enum vm_cpu_mode
 1910 vmx_cpu_mode(void)
 1911 {
 1912         uint32_t csar;
 1913 
 1914         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
 1915                 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 1916                 if (csar & 0x2000)
 1917                         return (CPU_MODE_64BIT);        /* CS.L = 1 */
 1918                 else
 1919                         return (CPU_MODE_COMPATIBILITY);
 1920         } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
 1921                 return (CPU_MODE_PROTECTED);
 1922         } else {
 1923                 return (CPU_MODE_REAL);
 1924         }
 1925 }
 1926 
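/*
 * Derive the guest paging mode from CR0.PG, CR4.PAE and EFER.LME: paging
 * disabled is reported as "flat", non-PAE protected-mode paging as 32-bit,
 * and EFER.LME selects 4-level (long mode) paging over PAE paging.
 */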
 1927 static enum vm_paging_mode
 1928 vmx_paging_mode(void)
 1929 {
 1930 
 1931         if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
 1932                 return (PAGING_MODE_FLAT);
 1933         if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
 1934                 return (PAGING_MODE_32);
 1935         if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
 1936                 return (PAGING_MODE_64);
 1937         else
 1938                 return (PAGING_MODE_PAE);
 1939 }
 1940 
 1941 static uint64_t
 1942 inout_str_index(struct vmx *vmx, int vcpuid, int in)
 1943 {
 1944         uint64_t val;
 1945         int error;
 1946         enum vm_reg_name reg;
 1947 
 1948         reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
 1949         error = vmx_getreg(vmx, vcpuid, reg, &val);
 1950         KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
 1951         return (val);
 1952 }
 1953 
 1954 static uint64_t
 1955 inout_str_count(struct vmx *vmx, int vcpuid, int rep)
 1956 {
 1957         uint64_t val;
 1958         int error;
 1959 
 1960         if (rep) {
 1961                 error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
 1962                 KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
 1963         } else {
 1964                 val = 1;
 1965         }
 1966         return (val);
 1967 }
 1968 
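/*
 * Decode the address size of an INS/OUTS from the VM-exit
 * instruction-information field: bits 9:7 encode 16-, 32- or 64-bit
 * addressing.
 */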
 1969 static int
 1970 inout_str_addrsize(uint32_t inst_info)
 1971 {
 1972         uint32_t size;
 1973 
 1974         size = (inst_info >> 7) & 0x7;
 1975         switch (size) {
 1976         case 0:
 1977                 return (2);     /* 16 bit */
 1978         case 1:
 1979                 return (4);     /* 32 bit */
 1980         case 2:
 1981                 return (8);     /* 64 bit */
 1982         default:
 1983                 panic("%s: invalid size encoding %d", __func__, size);
 1984         }
 1985 }
 1986 
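/*
 * For INS the destination segment is architecturally %es; for OUTS the
 * (possibly overridden) source segment is taken from bits 17:15 of the
 * instruction-information field.
 */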
 1987 static void
 1988 inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
 1989     struct vm_inout_str *vis)
 1990 {
 1991         int error, s;
 1992 
 1993         if (in) {
 1994                 vis->seg_name = VM_REG_GUEST_ES;
 1995         } else {
 1996                 s = (inst_info >> 15) & 0x7;
 1997                 vis->seg_name = vm_segment_name(s);
 1998         }
 1999 
 2000         error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
 2001         KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
 2002 }
 2003 
 2004 static void
 2005 vmx_paging_info(struct vm_guest_paging *paging)
 2006 {
 2007         paging->cr3 = vmcs_guest_cr3();
 2008         paging->cpl = vmx_cpl();
 2009         paging->cpu_mode = vmx_cpu_mode();
 2010         paging->paging_mode = vmx_paging_mode();
 2011 }
 2012 
 2013 static void
 2014 vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
 2015 {
 2016         struct vm_guest_paging *paging;
 2017         uint32_t csar;
 2018 
 2019         paging = &vmexit->u.inst_emul.paging;
 2020 
 2021         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
 2022         vmexit->inst_length = 0;
 2023         vmexit->u.inst_emul.gpa = gpa;
 2024         vmexit->u.inst_emul.gla = gla;
 2025         vmx_paging_info(paging);
 2026         switch (paging->cpu_mode) {
 2027         case CPU_MODE_REAL:
 2028                 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
 2029                 vmexit->u.inst_emul.cs_d = 0;
 2030                 break;
 2031         case CPU_MODE_PROTECTED:
 2032         case CPU_MODE_COMPATIBILITY:
 2033                 vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
 2034                 csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
 2035                 vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
 2036                 break;
 2037         default:
 2038                 vmexit->u.inst_emul.cs_base = 0;
 2039                 vmexit->u.inst_emul.cs_d = 0;
 2040                 break;
 2041         }
 2042         vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
 2043 }
 2044 
 2045 static int
 2046 ept_fault_type(uint64_t ept_qual)
 2047 {
 2048         int fault_type;
 2049 
 2050         if (ept_qual & EPT_VIOLATION_DATA_WRITE)
 2051                 fault_type = VM_PROT_WRITE;
 2052         else if (ept_qual & EPT_VIOLATION_INST_FETCH)
 2053                 fault_type = VM_PROT_EXECUTE;
 2054         else
  2055                 fault_type = VM_PROT_READ;
 2056 
 2057         return (fault_type);
 2058 }
 2059 
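/*
 * Returns true if an EPT violation looks like an ordinary data access to
 * MMIO space that can be resolved by emulating the faulting instruction.
 */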
 2060 static bool
 2061 ept_emulation_fault(uint64_t ept_qual)
 2062 {
 2063         int read, write;
 2064 
 2065         /* EPT fault on an instruction fetch doesn't make sense here */
 2066         if (ept_qual & EPT_VIOLATION_INST_FETCH)
 2067                 return (false);
 2068 
 2069         /* EPT fault must be a read fault or a write fault */
 2070         read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
 2071         write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
 2072         if ((read | write) == 0)
 2073                 return (false);
 2074 
 2075         /*
 2076          * The EPT violation must have been caused by accessing a
 2077          * guest-physical address that is a translation of a guest-linear
 2078          * address.
 2079          */
 2080         if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
 2081             (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
 2082                 return (false);
 2083         }
 2084 
 2085         return (true);
 2086 }
 2087 
 2088 static __inline int
 2089 apic_access_virtualization(struct vmx *vmx, int vcpuid)
 2090 {
 2091         uint32_t proc_ctls2;
 2092 
 2093         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
 2094         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0);
 2095 }
 2096 
 2097 static __inline int
 2098 x2apic_virtualization(struct vmx *vmx, int vcpuid)
 2099 {
 2100         uint32_t proc_ctls2;
 2101 
 2102         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
 2103         return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0);
 2104 }
 2105 
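/*
 * For an APIC-write VM exit the exit qualification holds the page offset
 * of the APIC register that was written; APIC_WRITE_OFFSET() extracts it
 * below and the write is then dispatched to the matching vlapic handler.
 */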
 2106 static int
 2107 vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic,
 2108     uint64_t qual)
 2109 {
 2110         int error, handled, offset;
 2111         uint32_t *apic_regs, vector;
 2112         bool retu;
 2113 
 2114         handled = HANDLED;
 2115         offset = APIC_WRITE_OFFSET(qual);
 2116 
 2117         if (!apic_access_virtualization(vmx, vcpuid)) {
 2118                 /*
 2119                  * In general there should not be any APIC write VM-exits
 2120                  * unless APIC-access virtualization is enabled.
 2121                  *
  2122                  * However, self-IPI virtualization can legitimately trigger
  2123                  * an APIC-write VM-exit, so treat it specially.
 2124                  */
 2125                 if (x2apic_virtualization(vmx, vcpuid) &&
 2126                     offset == APIC_OFFSET_SELF_IPI) {
 2127                         apic_regs = (uint32_t *)(vlapic->apic_page);
 2128                         vector = apic_regs[APIC_OFFSET_SELF_IPI / 4];
 2129                         vlapic_self_ipi_handler(vlapic, vector);
 2130                         return (HANDLED);
 2131                 } else
 2132                         return (UNHANDLED);
 2133         }
 2134 
 2135         switch (offset) {
 2136         case APIC_OFFSET_ID:
 2137                 vlapic_id_write_handler(vlapic);
 2138                 break;
 2139         case APIC_OFFSET_LDR:
 2140                 vlapic_ldr_write_handler(vlapic);
 2141                 break;
 2142         case APIC_OFFSET_DFR:
 2143                 vlapic_dfr_write_handler(vlapic);
 2144                 break;
 2145         case APIC_OFFSET_SVR:
 2146                 vlapic_svr_write_handler(vlapic);
 2147                 break;
 2148         case APIC_OFFSET_ESR:
 2149                 vlapic_esr_write_handler(vlapic);
 2150                 break;
 2151         case APIC_OFFSET_ICR_LOW:
 2152                 retu = false;
 2153                 error = vlapic_icrlo_write_handler(vlapic, &retu);
 2154                 if (error != 0 || retu)
 2155                         handled = UNHANDLED;
 2156                 break;
 2157         case APIC_OFFSET_CMCI_LVT:
 2158         case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
 2159                 vlapic_lvt_write_handler(vlapic, offset);
 2160                 break;
 2161         case APIC_OFFSET_TIMER_ICR:
 2162                 vlapic_icrtmr_write_handler(vlapic);
 2163                 break;
 2164         case APIC_OFFSET_TIMER_DCR:
 2165                 vlapic_dcr_write_handler(vlapic);
 2166                 break;
 2167         default:
 2168                 handled = UNHANDLED;
 2169                 break;
 2170         }
 2171         return (handled);
 2172 }
 2173 
 2174 static bool
 2175 apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa)
 2176 {
 2177 
 2178         if (apic_access_virtualization(vmx, vcpuid) &&
 2179             (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE))
 2180                 return (true);
 2181         else
 2182                 return (false);
 2183 }
 2184 
 2185 static int
 2186 vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 2187 {
 2188         uint64_t qual;
 2189         int access_type, offset, allowed;
 2190 
 2191         if (!apic_access_virtualization(vmx, vcpuid))
 2192                 return (UNHANDLED);
 2193 
 2194         qual = vmexit->u.vmx.exit_qualification;
 2195         access_type = APIC_ACCESS_TYPE(qual);
 2196         offset = APIC_ACCESS_OFFSET(qual);
 2197 
 2198         allowed = 0;
 2199         if (access_type == 0) {
 2200                 /*
 2201                  * Read data access to the following registers is expected.
 2202                  */
 2203                 switch (offset) {
 2204                 case APIC_OFFSET_APR:
 2205                 case APIC_OFFSET_PPR:
 2206                 case APIC_OFFSET_RRR:
 2207                 case APIC_OFFSET_CMCI_LVT:
 2208                 case APIC_OFFSET_TIMER_CCR:
 2209                         allowed = 1;
 2210                         break;
 2211                 default:
 2212                         break;
 2213                 }
 2214         } else if (access_type == 1) {
 2215                 /*
 2216                  * Write data access to the following registers is expected.
 2217                  */
 2218                 switch (offset) {
 2219                 case APIC_OFFSET_VER:
 2220                 case APIC_OFFSET_APR:
 2221                 case APIC_OFFSET_PPR:
 2222                 case APIC_OFFSET_RRR:
 2223                 case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
 2224                 case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
 2225                 case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
 2226                 case APIC_OFFSET_CMCI_LVT:
 2227                 case APIC_OFFSET_TIMER_CCR:
 2228                         allowed = 1;
 2229                         break;
 2230                 default:
 2231                         break;
 2232                 }
 2233         }
 2234 
 2235         if (allowed) {
 2236                 vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset,
 2237                     VIE_INVALID_GLA);
 2238         }
 2239 
 2240         /*
  2241          * Regardless of whether the APIC access is allowed, this handler
 2242          * always returns UNHANDLED:
 2243          * - if the access is allowed then it is handled by emulating the
 2244          *   instruction that caused the VM-exit (outside the critical section)
 2245          * - if the access is not allowed then it will be converted to an
 2246          *   exitcode of VM_EXITCODE_VMX and will be dealt with in userland.
 2247          */
 2248         return (UNHANDLED);
 2249 }
 2250 
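/*
 * Bits 31:30 of the task-switch exit qualification encode how the switch
 * was initiated: a CALL, an IRET, a JMP, or a task gate in the IDT.
 */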
 2251 static enum task_switch_reason
 2252 vmx_task_switch_reason(uint64_t qual)
 2253 {
 2254         int reason;
 2255 
 2256         reason = (qual >> 30) & 0x3;
 2257         switch (reason) {
 2258         case 0:
 2259                 return (TSR_CALL);
 2260         case 1:
 2261                 return (TSR_IRET);
 2262         case 2:
 2263                 return (TSR_JMP);
 2264         case 3:
 2265                 return (TSR_IDT_GATE);
 2266         default:
 2267                 panic("%s: invalid reason %d", __func__, reason);
 2268         }
 2269 }
 2270 
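/*
 * RDMSR and WRMSR use %ecx for the MSR number and %edx:%eax for the
 * 64-bit value, so the emulation below assembles or splits it, e.g.
 *
 *	wrmsr:	val = (uint64_t)edx << 32 | eax;
 *	rdmsr:	eax = val; edx = val >> 32;
 *
 * Local APIC MSRs are routed to the vlapic code and everything else to
 * the VMX MSR module.
 */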
 2271 static int
 2272 emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
 2273 {
 2274         int error;
 2275 
 2276         if (lapic_msr(num))
 2277                 error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
 2278         else
 2279                 error = vmx_wrmsr(vmx, vcpuid, num, val, retu);
 2280 
 2281         return (error);
 2282 }
 2283 
 2284 static int
 2285 emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
 2286 {
 2287         struct vmxctx *vmxctx;
 2288         uint64_t result;
 2289         uint32_t eax, edx;
 2290         int error;
 2291 
 2292         if (lapic_msr(num))
 2293                 error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu);
 2294         else
 2295                 error = vmx_rdmsr(vmx, vcpuid, num, &result, retu);
 2296 
 2297         if (error == 0) {
 2298                 eax = result;
 2299                 vmxctx = &vmx->ctx[vcpuid];
 2300                 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
 2301                 KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
 2302 
 2303                 edx = result >> 32;
 2304                 error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
 2305                 KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
 2306         }
 2307 
 2308         return (error);
 2309 }
 2310 
 2311 static int
 2312 vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
 2313 {
 2314         int error, errcode, errcode_valid, handled, in;
 2315         struct vmxctx *vmxctx;
 2316         struct vlapic *vlapic;
 2317         struct vm_inout_str *vis;
 2318         struct vm_task_switch *ts;
 2319         uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info;
 2320         uint32_t intr_type, intr_vec, reason;
 2321         uint64_t exitintinfo, qual, gpa;
 2322         bool retu;
 2323 
 2324         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
 2325         CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
 2326 
 2327         handled = UNHANDLED;
 2328         vmxctx = &vmx->ctx[vcpu];
 2329 
 2330         qual = vmexit->u.vmx.exit_qualification;
 2331         reason = vmexit->u.vmx.exit_reason;
 2332         vmexit->exitcode = VM_EXITCODE_BOGUS;
 2333 
 2334         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
 2335         SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit);
 2336 
 2337         /*
 2338          * VM-entry failures during or after loading guest state.
 2339          *
 2340          * These VM-exits are uncommon but must be handled specially
 2341          * as most VM-exit fields are not populated as usual.
 2342          */
 2343         if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) {
 2344                 VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry");
 2345                 __asm __volatile("int $18");
 2346                 return (1);
 2347         }
 2348 
 2349         /*
 2350          * VM exits that can be triggered during event delivery need to
 2351          * be handled specially by re-injecting the event if the IDT
 2352          * vectoring information field's valid bit is set.
 2353          *
 2354          * See "Information for VM Exits During Event Delivery" in Intel SDM
 2355          * for details.
 2356          */
 2357         idtvec_info = vmcs_idt_vectoring_info();
 2358         if (idtvec_info & VMCS_IDT_VEC_VALID) {
 2359                 idtvec_info &= ~(1 << 12); /* clear undefined bit */
 2360                 exitintinfo = idtvec_info;
 2361                 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 2362                         idtvec_err = vmcs_idt_vectoring_err();
 2363                         exitintinfo |= (uint64_t)idtvec_err << 32;
 2364                 }
 2365                 error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo);
 2366                 KASSERT(error == 0, ("%s: vm_set_intinfo error %d",
 2367                     __func__, error));
 2368 
 2369                 /*
 2370                  * If 'virtual NMIs' are being used and the VM-exit
 2371                  * happened while injecting an NMI during the previous
 2372                  * VM-entry, then clear "blocking by NMI" in the
 2373                  * Guest Interruptibility-State so the NMI can be
 2374                  * reinjected on the subsequent VM-entry.
 2375                  *
 2376                  * However, if the NMI was being delivered through a task
 2377                  * gate, then the new task must start execution with NMIs
 2378                  * blocked so don't clear NMI blocking in this case.
 2379                  */
 2380                 intr_type = idtvec_info & VMCS_INTR_T_MASK;
 2381                 if (intr_type == VMCS_INTR_T_NMI) {
 2382                         if (reason != EXIT_REASON_TASK_SWITCH)
 2383                                 vmx_clear_nmi_blocking(vmx, vcpu);
 2384                         else
 2385                                 vmx_assert_nmi_blocking(vmx, vcpu);
 2386                 }
 2387 
 2388                 /*
 2389                  * Update VM-entry instruction length if the event being
 2390                  * delivered was a software interrupt or software exception.
 2391                  */
 2392                 if (intr_type == VMCS_INTR_T_SWINTR ||
 2393                     intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION ||
 2394                     intr_type == VMCS_INTR_T_SWEXCEPTION) {
 2395                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
 2396                 }
 2397         }
 2398 
 2399         switch (reason) {
 2400         case EXIT_REASON_TASK_SWITCH:
 2401                 ts = &vmexit->u.task_switch;
 2402                 ts->tsssel = qual & 0xffff;
 2403                 ts->reason = vmx_task_switch_reason(qual);
 2404                 ts->ext = 0;
 2405                 ts->errcode_valid = 0;
 2406                 vmx_paging_info(&ts->paging);
 2407                 /*
 2408                  * If the task switch was due to a CALL, JMP, IRET, software
 2409                  * interrupt (INT n) or software exception (INT3, INTO),
 2410                  * then the saved %rip references the instruction that caused
 2411                  * the task switch. The instruction length field in the VMCS
 2412                  * is valid in this case.
 2413                  *
 2414                  * In all other cases (e.g., NMI, hardware exception) the
 2415                  * saved %rip is one that would have been saved in the old TSS
 2416                  * had the task switch completed normally so the instruction
  2417                  * had the task switch completed normally, so the instruction
 2418                  * set to 0.
 2419                  */
 2420                 if (ts->reason == TSR_IDT_GATE) {
 2421                         KASSERT(idtvec_info & VMCS_IDT_VEC_VALID,
 2422                             ("invalid idtvec_info %#x for IDT task switch",
 2423                             idtvec_info));
 2424                         intr_type = idtvec_info & VMCS_INTR_T_MASK;
 2425                         if (intr_type != VMCS_INTR_T_SWINTR &&
 2426                             intr_type != VMCS_INTR_T_SWEXCEPTION &&
 2427                             intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) {
 2428                                 /* Task switch triggered by external event */
 2429                                 ts->ext = 1;
 2430                                 vmexit->inst_length = 0;
 2431                                 if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) {
 2432                                         ts->errcode_valid = 1;
 2433                                         ts->errcode = vmcs_idt_vectoring_err();
 2434                                 }
 2435                         }
 2436                 }
 2437                 vmexit->exitcode = VM_EXITCODE_TASK_SWITCH;
 2438                 SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts);
 2439                 VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, "
 2440                     "%s errcode 0x%016lx", ts->reason, ts->tsssel,
 2441                     ts->ext ? "external" : "internal",
 2442                     ((uint64_t)ts->errcode << 32) | ts->errcode_valid);
 2443                 break;
 2444         case EXIT_REASON_CR_ACCESS:
 2445                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
 2446                 SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual);
 2447                 switch (qual & 0xf) {
 2448                 case 0:
 2449                         handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
 2450                         break;
 2451                 case 4:
 2452                         handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
 2453                         break;
 2454                 case 8:
 2455                         handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
 2456                         break;
 2457                 }
 2458                 break;
 2459         case EXIT_REASON_RDMSR:
 2460                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
 2461                 retu = false;
 2462                 ecx = vmxctx->guest_rcx;
 2463                 VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
 2464                 SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx);
 2465                 error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
 2466                 if (error) {
 2467                         vmexit->exitcode = VM_EXITCODE_RDMSR;
 2468                         vmexit->u.msr.code = ecx;
 2469                 } else if (!retu) {
 2470                         handled = HANDLED;
 2471                 } else {
 2472                         /* Return to userspace with a valid exitcode */
 2473                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 2474                             ("emulate_rdmsr retu with bogus exitcode"));
 2475                 }
 2476                 break;
 2477         case EXIT_REASON_WRMSR:
 2478                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
 2479                 retu = false;
 2480                 eax = vmxctx->guest_rax;
 2481                 ecx = vmxctx->guest_rcx;
 2482                 edx = vmxctx->guest_rdx;
 2483                 VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
 2484                     ecx, (uint64_t)edx << 32 | eax);
 2485                 SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx,
 2486                     (uint64_t)edx << 32 | eax);
 2487                 error = emulate_wrmsr(vmx, vcpu, ecx,
 2488                     (uint64_t)edx << 32 | eax, &retu);
 2489                 if (error) {
 2490                         vmexit->exitcode = VM_EXITCODE_WRMSR;
 2491                         vmexit->u.msr.code = ecx;
 2492                         vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
 2493                 } else if (!retu) {
 2494                         handled = HANDLED;
 2495                 } else {
 2496                         /* Return to userspace with a valid exitcode */
 2497                         KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 2498                             ("emulate_wrmsr retu with bogus exitcode"));
 2499                 }
 2500                 break;
 2501         case EXIT_REASON_HLT:
 2502                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
 2503                 SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit);
 2504                 vmexit->exitcode = VM_EXITCODE_HLT;
 2505                 vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 2506                 if (virtual_interrupt_delivery)
 2507                         vmexit->u.hlt.intr_status =
 2508                             vmcs_read(VMCS_GUEST_INTR_STATUS);
 2509                 else
 2510                         vmexit->u.hlt.intr_status = 0;
 2511                 break;
 2512         case EXIT_REASON_MTF:
 2513                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
 2514                 SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit);
 2515                 vmexit->exitcode = VM_EXITCODE_MTRAP;
 2516                 vmexit->inst_length = 0;
 2517                 break;
 2518         case EXIT_REASON_PAUSE:
 2519                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
 2520                 SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit);
 2521                 vmexit->exitcode = VM_EXITCODE_PAUSE;
 2522                 break;
 2523         case EXIT_REASON_INTR_WINDOW:
 2524                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
 2525                 SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit);
 2526                 vmx_clear_int_window_exiting(vmx, vcpu);
 2527                 return (1);
 2528         case EXIT_REASON_EXT_INTR:
 2529                 /*
 2530                  * External interrupts serve only to cause VM exits and allow
 2531                  * the host interrupt handler to run.
 2532                  *
 2533                  * If this external interrupt triggers a virtual interrupt
 2534                  * to a VM, then that state will be recorded by the
 2535                  * host interrupt handler in the VM's softc. We will inject
 2536                  * this virtual interrupt during the subsequent VM enter.
 2537                  */
 2538                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 2539                 SDT_PROBE4(vmm, vmx, exit, interrupt,
 2540                     vmx, vcpu, vmexit, intr_info);
 2541 
 2542                 /*
 2543                  * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
 2544                  * This appears to be a bug in VMware Fusion?
 2545                  */
 2546                 if (!(intr_info & VMCS_INTR_VALID))
 2547                         return (1);
 2548                 KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
 2549                     (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
 2550                     ("VM exit interruption info invalid: %#x", intr_info));
 2551                 vmx_trigger_hostintr(intr_info & 0xff);
 2552 
 2553                 /*
  2554                  * This is special. We want to treat this as a 'handled'
 2555                  * VM-exit but not increment the instruction pointer.
 2556                  */
 2557                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
 2558                 return (1);
 2559         case EXIT_REASON_NMI_WINDOW:
 2560                 SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit);
 2561                 /* Exit to allow the pending virtual NMI to be injected */
 2562                 if (vm_nmi_pending(vmx->vm, vcpu))
 2563                         vmx_inject_nmi(vmx, vcpu);
 2564                 vmx_clear_nmi_window_exiting(vmx, vcpu);
 2565                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
 2566                 return (1);
 2567         case EXIT_REASON_INOUT:
 2568                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
 2569                 vmexit->exitcode = VM_EXITCODE_INOUT;
 2570                 vmexit->u.inout.bytes = (qual & 0x7) + 1;
 2571                 vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
 2572                 vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
 2573                 vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
 2574                 vmexit->u.inout.port = (uint16_t)(qual >> 16);
 2575                 vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
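                /*
                 * The decode above follows the exit-qualification layout
                 * for I/O instructions: bits 2:0 give the access size
                 * minus one, bit 3 the direction (1 = IN), bit 4 the
                 * string form, bit 5 a REP prefix and bits 31:16 the
                 * port number.
                 */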
 2576                 if (vmexit->u.inout.string) {
 2577                         inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
 2578                         vmexit->exitcode = VM_EXITCODE_INOUT_STR;
 2579                         vis = &vmexit->u.inout_str;
 2580                         vmx_paging_info(&vis->paging);
 2581                         vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
 2582                         vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
 2583                         vis->index = inout_str_index(vmx, vcpu, in);
 2584                         vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
 2585                         vis->addrsize = inout_str_addrsize(inst_info);
 2586                         inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
 2587                 }
 2588                 SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit);
 2589                 break;
 2590         case EXIT_REASON_CPUID:
 2591                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
 2592                 SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit);
 2593                 handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
 2594                 break;
 2595         case EXIT_REASON_EXCEPTION:
 2596                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
 2597                 intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 2598                 KASSERT((intr_info & VMCS_INTR_VALID) != 0,
 2599                     ("VM exit interruption info invalid: %#x", intr_info));
 2600 
 2601                 intr_vec = intr_info & 0xff;
 2602                 intr_type = intr_info & VMCS_INTR_T_MASK;
 2603 
 2604                 /*
 2605                  * If Virtual NMIs control is 1 and the VM-exit is due to a
 2606                  * fault encountered during the execution of IRET then we must
 2607                  * restore the state of "virtual-NMI blocking" before resuming
 2608                  * the guest.
 2609                  *
 2610                  * See "Resuming Guest Software after Handling an Exception".
 2611                  * See "Information for VM Exits Due to Vectored Events".
 2612                  */
 2613                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 2614                     (intr_vec != IDT_DF) &&
 2615                     (intr_info & EXIT_QUAL_NMIUDTI) != 0)
 2616                         vmx_restore_nmi_blocking(vmx, vcpu);
 2617 
 2618                 /*
 2619                  * The NMI has already been handled in vmx_exit_handle_nmi().
 2620                  */
 2621                 if (intr_type == VMCS_INTR_T_NMI)
 2622                         return (1);
 2623 
 2624                 /*
 2625                  * Call the machine check handler by hand. Also don't reflect
 2626                  * the machine check back into the guest.
 2627                  */
 2628                 if (intr_vec == IDT_MC) {
 2629                         VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler");
 2630                         __asm __volatile("int $18");
 2631                         return (1);
 2632                 }
 2633 
 2634                 /*
 2635                  * If the hypervisor has requested user exits for
 2636                  * debug exceptions, bounce them out to userland.
 2637                  */
 2638                 if (intr_type == VMCS_INTR_T_SWEXCEPTION && intr_vec == IDT_BP &&
 2639                     (vmx->cap[vcpu].set & (1 << VM_CAP_BPT_EXIT))) {
 2640                         vmexit->exitcode = VM_EXITCODE_BPT;
 2641                         vmexit->u.bpt.inst_length = vmexit->inst_length;
 2642                         vmexit->inst_length = 0;
 2643                         break;
 2644                 }
 2645 
 2646                 if (intr_vec == IDT_PF) {
 2647                         error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual);
 2648                         KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d",
 2649                             __func__, error));
 2650                 }
 2651 
 2652                 /*
 2653                  * Software exceptions exhibit trap-like behavior. This in
 2654                  * turn requires populating the VM-entry instruction length
 2655                  * so that the %rip in the trap frame is past the INT3/INTO
 2656                  * instruction.
 2657                  */
 2658                 if (intr_type == VMCS_INTR_T_SWEXCEPTION)
 2659                         vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);
 2660 
 2661                 /* Reflect all other exceptions back into the guest */
 2662                 errcode_valid = errcode = 0;
 2663                 if (intr_info & VMCS_INTR_DEL_ERRCODE) {
 2664                         errcode_valid = 1;
 2665                         errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE);
 2666                 }
 2667                 VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into "
 2668                     "the guest", intr_vec, errcode);
 2669                 SDT_PROBE5(vmm, vmx, exit, exception,
 2670                     vmx, vcpu, vmexit, intr_vec, errcode);
 2671                 error = vm_inject_exception(vmx->vm, vcpu, intr_vec,
 2672                     errcode_valid, errcode, 0);
 2673                 KASSERT(error == 0, ("%s: vm_inject_exception error %d",
 2674                     __func__, error));
 2675                 return (1);
 2676 
 2677         case EXIT_REASON_EPT_FAULT:
 2678                 /*
 2679                  * If 'gpa' lies within the address space allocated to
 2680                  * memory then this must be a nested page fault; otherwise
 2681                  * it must be an instruction that accesses MMIO space.
 2682                  */
 2683                 gpa = vmcs_gpa();
 2684                 if (vm_mem_allocated(vmx->vm, vcpu, gpa) ||
 2685                     apic_access_fault(vmx, vcpu, gpa)) {
 2686                         vmexit->exitcode = VM_EXITCODE_PAGING;
 2687                         vmexit->inst_length = 0;
 2688                         vmexit->u.paging.gpa = gpa;
 2689                         vmexit->u.paging.fault_type = ept_fault_type(qual);
 2690                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1);
 2691                         SDT_PROBE5(vmm, vmx, exit, nestedfault,
 2692                             vmx, vcpu, vmexit, gpa, qual);
 2693                 } else if (ept_emulation_fault(qual)) {
 2694                         vmexit_inst_emul(vmexit, gpa, vmcs_gla());
 2695                         vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
 2696                         SDT_PROBE4(vmm, vmx, exit, mmiofault,
 2697                             vmx, vcpu, vmexit, gpa);
 2698                 }
 2699                 /*
 2700                  * If Virtual NMIs control is 1 and the VM-exit is due to an
 2701                  * EPT fault during the execution of IRET then we must restore
 2702                  * the state of "virtual-NMI blocking" before resuming.
 2703                  *
 2704                  * See description of "NMI unblocking due to IRET" in
 2705                  * "Exit Qualification for EPT Violations".
 2706                  */
 2707                 if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
 2708                     (qual & EXIT_QUAL_NMIUDTI) != 0)
 2709                         vmx_restore_nmi_blocking(vmx, vcpu);
 2710                 break;
 2711         case EXIT_REASON_VIRTUALIZED_EOI:
 2712                 vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI;
 2713                 vmexit->u.ioapic_eoi.vector = qual & 0xFF;
 2714                 SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit);
 2715                 vmexit->inst_length = 0;        /* trap-like */
 2716                 break;
 2717         case EXIT_REASON_APIC_ACCESS:
 2718                 SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit);
 2719                 handled = vmx_handle_apic_access(vmx, vcpu, vmexit);
 2720                 break;
 2721         case EXIT_REASON_APIC_WRITE:
 2722                 /*
 2723                  * APIC-write VM exit is trap-like so the %rip is already
 2724                  * pointing to the next instruction.
 2725                  */
 2726                 vmexit->inst_length = 0;
 2727                 vlapic = vm_lapic(vmx->vm, vcpu);
 2728                 SDT_PROBE4(vmm, vmx, exit, apicwrite,
 2729                     vmx, vcpu, vmexit, vlapic);
 2730                 handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual);
 2731                 break;
 2732         case EXIT_REASON_XSETBV:
 2733                 SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit);
 2734                 handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit);
 2735                 break;
 2736         case EXIT_REASON_MONITOR:
 2737                 SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit);
 2738                 vmexit->exitcode = VM_EXITCODE_MONITOR;
 2739                 break;
 2740         case EXIT_REASON_MWAIT:
 2741                 SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit);
 2742                 vmexit->exitcode = VM_EXITCODE_MWAIT;
 2743                 break;
 2744         case EXIT_REASON_TPR:
 2745                 vlapic = vm_lapic(vmx->vm, vcpu);
 2746                 vlapic_sync_tpr(vlapic);
 2747                 vmexit->inst_length = 0;
 2748                 handled = HANDLED;
 2749                 break;
 2750         case EXIT_REASON_VMCALL:
 2751         case EXIT_REASON_VMCLEAR:
 2752         case EXIT_REASON_VMLAUNCH:
 2753         case EXIT_REASON_VMPTRLD:
 2754         case EXIT_REASON_VMPTRST:
 2755         case EXIT_REASON_VMREAD:
 2756         case EXIT_REASON_VMRESUME:
 2757         case EXIT_REASON_VMWRITE:
 2758         case EXIT_REASON_VMXOFF:
 2759         case EXIT_REASON_VMXON:
 2760                 SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit);
 2761                 vmexit->exitcode = VM_EXITCODE_VMINSN;
 2762                 break;
 2763         default:
 2764                 SDT_PROBE4(vmm, vmx, exit, unknown,
 2765                     vmx, vcpu, vmexit, reason);
 2766                 vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
 2767                 break;
 2768         }
 2769 
 2770         if (handled) {
 2771                 /*
 2772                  * It is possible that control is returned to userland
 2773                  * even though we were able to handle the VM exit in the
 2774                  * kernel.
 2775                  *
 2776                  * In such a case we want to make sure that the userland
 2777                  * restarts guest execution at the instruction *after*
 2778                  * the one we just processed. Therefore we update the
 2779                  * guest rip in the VMCS and in 'vmexit'.
 2780                  */
 2781                 vmexit->rip += vmexit->inst_length;
 2782                 vmexit->inst_length = 0;
 2783                 vmcs_write(VMCS_GUEST_RIP, vmexit->rip);
 2784         } else {
 2785                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
 2786                         /*
 2787                          * If this VM exit was not claimed by anybody then
 2788                          * treat it as a generic VMX exit.
 2789                          */
 2790                         vmexit->exitcode = VM_EXITCODE_VMX;
 2791                         vmexit->u.vmx.status = VM_SUCCESS;
 2792                         vmexit->u.vmx.inst_type = 0;
 2793                         vmexit->u.vmx.inst_error = 0;
 2794                 } else {
 2795                         /*
 2796                          * The exitcode and collateral have been populated.
 2797                          * The VM exit will be processed further in userland.
 2798                          */
 2799                 }
 2800         }
 2801 
 2802         SDT_PROBE4(vmm, vmx, exit, return,
 2803             vmx, vcpu, vmexit, handled);
 2804         return (handled);
 2805 }
 2806 
 2807 static __inline void
 2808 vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit)
 2809 {
 2810 
 2811         KASSERT(vmxctx->inst_fail_status != VM_SUCCESS,
 2812             ("vmx_exit_inst_error: invalid inst_fail_status %d",
 2813             vmxctx->inst_fail_status));
 2814 
 2815         vmexit->inst_length = 0;
 2816         vmexit->exitcode = VM_EXITCODE_VMX;
 2817         vmexit->u.vmx.status = vmxctx->inst_fail_status;
 2818         vmexit->u.vmx.inst_error = vmcs_instruction_error();
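              /*
               * VM entry itself failed, so there is no VM exit to describe;
               * mark the exit reason and qualification as invalid.
               */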
 2819         vmexit->u.vmx.exit_reason = ~0;
 2820         vmexit->u.vmx.exit_qualification = ~0;
 2821 
 2822         switch (rc) {
 2823         case VMX_VMRESUME_ERROR:
 2824         case VMX_VMLAUNCH_ERROR:
 2825                 vmexit->u.vmx.inst_type = rc;
 2826                 break;
 2827         default:
 2828                 panic("vmx_exit_inst_error: vmx_enter_guest returned %d", rc);
 2829         }
 2830 }
 2831 
 2832 /*
 2833  * If the NMI-exiting VM execution control is set to '1' then an NMI in
 2834  * non-root operation causes a VM-exit. NMI blocking is in effect so it is
 2835  * sufficient to simply vector to the NMI handler via a software interrupt.
 2836  * However, this must be done before maskable interrupts are enabled
 2837  * otherwise the "iret" issued by an interrupt handler will incorrectly
 2838  * clear NMI blocking.
 2839  */
 2840 static __inline void
 2841 vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit)
 2842 {
 2843         uint32_t intr_info;
 2844 
 2845         KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled"));
 2846 
 2847         if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION)
 2848                 return;
 2849 
 2850         intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
 2851         KASSERT((intr_info & VMCS_INTR_VALID) != 0,
 2852             ("VM exit interruption info invalid: %#x", intr_info));
 2853 
 2854         if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) {
 2855                 KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due "
 2856                     "to NMI has invalid vector: %#x", intr_info));
 2857                 VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler");
 2858                 __asm __volatile("int $2");
 2859         }
 2860 }
 2861 
 2862 static __inline void
 2863 vmx_dr_enter_guest(struct vmxctx *vmxctx)
 2864 {
 2865         register_t rflags;
 2866 
 2867         /* Save host control debug registers. */
 2868         vmxctx->host_dr7 = rdr7();
 2869         vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
 2870 
 2871         /*
 2872          * Disable debugging in DR7 and DEBUGCTL to avoid triggering
 2873          * exceptions in the host based on the guest DRx values.  The
 2874          * guest DR7 and DEBUGCTL are saved/restored in the VMCS.
 2875          */
 2876         load_dr7(0);
 2877         wrmsr(MSR_DEBUGCTLMSR, 0);
 2878 
 2879         /*
 2880          * Disable single stepping the kernel to avoid corrupting the
 2881          * guest DR6.  A debugger might still be able to corrupt the
 2882          * guest DR6 by setting a breakpoint after this point and then
 2883          * single stepping.
 2884          */
 2885         rflags = read_rflags();
 2886         vmxctx->host_tf = rflags & PSL_T;
 2887         write_rflags(rflags & ~PSL_T);
 2888 
 2889         /* Save host debug registers. */
 2890         vmxctx->host_dr0 = rdr0();
 2891         vmxctx->host_dr1 = rdr1();
 2892         vmxctx->host_dr2 = rdr2();
 2893         vmxctx->host_dr3 = rdr3();
 2894         vmxctx->host_dr6 = rdr6();
 2895 
 2896         /* Restore guest debug registers. */
 2897         load_dr0(vmxctx->guest_dr0);
 2898         load_dr1(vmxctx->guest_dr1);
 2899         load_dr2(vmxctx->guest_dr2);
 2900         load_dr3(vmxctx->guest_dr3);
 2901         load_dr6(vmxctx->guest_dr6);
 2902 }
 2903 
 2904 static __inline void
 2905 vmx_dr_leave_guest(struct vmxctx *vmxctx)
 2906 {
 2907 
 2908         /* Save guest debug registers. */
 2909         vmxctx->guest_dr0 = rdr0();
 2910         vmxctx->guest_dr1 = rdr1();
 2911         vmxctx->guest_dr2 = rdr2();
 2912         vmxctx->guest_dr3 = rdr3();
 2913         vmxctx->guest_dr6 = rdr6();
 2914 
 2915         /*
 2916          * Restore host debug registers.  Restore DR7, DEBUGCTL, and
 2917          * PSL_T last.
 2918          */
 2919         load_dr0(vmxctx->host_dr0);
 2920         load_dr1(vmxctx->host_dr1);
 2921         load_dr2(vmxctx->host_dr2);
 2922         load_dr3(vmxctx->host_dr3);
 2923         load_dr6(vmxctx->host_dr6);
 2924         wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl);
 2925         load_dr7(vmxctx->host_dr7);
 2926         write_rflags(read_rflags() | vmxctx->host_tf);
 2927 }
 2928 
 2929 static __inline void
 2930 vmx_pmap_activate(struct vmx *vmx, pmap_t pmap)
 2931 {
 2932         long eptgen;
 2933         int cpu;
 2934 
 2935         cpu = curcpu;
 2936 
 2937         CPU_SET_ATOMIC(cpu, &pmap->pm_active);
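              /*
               * 'pm_eptgen' is bumped whenever the nested page tables are
               * modified.  If this CPU last ran the guest with a stale
               * generation, flush the EPTP-tagged TLB entries with a
               * single-context INVEPT before entering the guest.
               */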
 2938         eptgen = atomic_load_long(&pmap->pm_eptgen);
 2939         if (eptgen != vmx->eptgen[cpu]) {
 2940                 vmx->eptgen[cpu] = eptgen;
 2941                 invept(INVEPT_TYPE_SINGLE_CONTEXT,
 2942                     (struct invept_desc){ .eptp = vmx->eptp, ._res = 0 });
 2943         }
 2944 }
 2945 
 2946 static __inline void
 2947 vmx_pmap_deactivate(struct vmx *vmx, pmap_t pmap)
 2948 {
 2949         CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
 2950 }
 2951 
 2952 static int
 2953 vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
 2954     struct vm_eventinfo *evinfo)
 2955 {
 2956         int rc, handled, launched;
 2957         struct vmx *vmx;
 2958         struct vm *vm;
 2959         struct vmxctx *vmxctx;
 2960         struct vmcs *vmcs;
 2961         struct vm_exit *vmexit;
 2962         struct vlapic *vlapic;
 2963         uint32_t exit_reason;
 2964         struct region_descriptor gdtr, idtr;
 2965         uint16_t ldt_sel;
 2966 
 2967         vmx = arg;
 2968         vm = vmx->vm;
 2969         vmcs = &vmx->vmcs[vcpu];
 2970         vmxctx = &vmx->ctx[vcpu];
 2971         vlapic = vm_lapic(vm, vcpu);
 2972         vmexit = vm_exitinfo(vm, vcpu);
 2973         launched = 0;
 2974 
 2975         KASSERT(vmxctx->pmap == pmap,
 2976             ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap));
 2977 
 2978         vmx_msr_guest_enter(vmx, vcpu);
 2979 
 2980         VMPTRLD(vmcs);
 2981 
 2982         /*
 2983          * XXX
 2984                  * We do this every time because we may set up the virtual machine
 2985          * from a different process than the one that actually runs it.
 2986          *
 2987          * If the life of a virtual machine was spent entirely in the context
 2988          * of a single process we could do this once in vmx_vminit().
 2989          */
 2990         vmcs_write(VMCS_HOST_CR3, rcr3());
 2991 
 2992         vmcs_write(VMCS_GUEST_RIP, rip);
 2993         vmx_set_pcpu_defaults(vmx, vcpu, pmap);
 2994         do {
 2995                 KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch "
 2996                     "%#lx/%#lx", __func__, vmcs_guest_rip(), rip));
 2997 
 2998                 handled = UNHANDLED;
 2999                 /*
 3000                  * Interrupts are disabled from this point on until the
 3001                  * guest starts executing. This is done for the following
 3002                  * reasons:
 3003                  *
 3004                  * If an AST is asserted on this thread after the check below,
 3005                  * then the IPI_AST notification will not be lost, because it
 3006                  * will cause a VM exit due to external interrupt as soon as
 3007                  * the guest state is loaded.
 3008                  *
 3009                  * A posted interrupt after 'vmx_inject_interrupts()' will
 3010                  * not be "lost" because it will be held pending in the host
 3011                  * APIC while interrupts are disabled. The pending interrupt
 3012                  * will be recognized as soon as the guest state is loaded.
 3013                  *
 3014                  * The same reasoning applies to the IPI generated by
 3015                  * pmap_invalidate_ept().
 3016                  */
 3017                 disable_intr();
 3018                 vmx_inject_interrupts(vmx, vcpu, vlapic, rip);
 3019 
 3020                 /*
 3021                  * Check for vcpu suspension after injecting events because
 3022                  * vmx_inject_interrupts() can suspend the vcpu due to a
 3023                  * triple fault.
 3024                  */
 3025                 if (vcpu_suspended(evinfo)) {
 3026                         enable_intr();
 3027                         vm_exit_suspended(vmx->vm, vcpu, rip);
 3028                         break;
 3029                 }
 3030 
 3031                 if (vcpu_rendezvous_pending(evinfo)) {
 3032                         enable_intr();
 3033                         vm_exit_rendezvous(vmx->vm, vcpu, rip);
 3034                         break;
 3035                 }
 3036 
 3037                 if (vcpu_reqidle(evinfo)) {
 3038                         enable_intr();
 3039                         vm_exit_reqidle(vmx->vm, vcpu, rip);
 3040                         break;
 3041                 }
 3042 
 3043                 if (vcpu_should_yield(vm, vcpu)) {
 3044                         enable_intr();
 3045                         vm_exit_astpending(vmx->vm, vcpu, rip);
 3046                         vmx_astpending_trace(vmx, vcpu, rip);
 3047                         handled = HANDLED;
 3048                         break;
 3049                 }
 3050 
 3051                 if (vcpu_debugged(vm, vcpu)) {
 3052                         enable_intr();
 3053                         vm_exit_debug(vmx->vm, vcpu, rip);
 3054                         break;
 3055                 }
 3056 
 3057                 /*
 3058                  * If TPR Shadowing is enabled, the TPR Threshold
 3059                  * must be updated right before entering the guest.
 3060                  */
 3061                 if (tpr_shadowing && !virtual_interrupt_delivery) {
 3062                         if ((vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0) {
 3063                                 vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic));
 3064                         }
 3065                 }
 3066 
 3067                 /*
 3068                  * VM exits restore the base address but not the
 3069                  * limits of GDTR and IDTR.  The VMCS only stores the
 3070                  * base address, so VM exits set the limits to 0xffff.
 3071                  * Save and restore the full GDTR and IDTR to restore
 3072                  * the limits.
 3073                  *
 3074                  * The VMCS does not save the LDTR at all, and VM
 3075                  * exits clear LDTR as if a NULL selector were loaded.
 3076                  * The userspace hypervisor probably doesn't use an
 3077                  * LDT, but save and restore it to be safe.
 3078                  */
 3079                 sgdt(&gdtr);
 3080                 sidt(&idtr);
 3081                 ldt_sel = sldt();
 3082 
 3083                 /*
 3084                  * The TSC_AUX MSR must be saved/restored while interrupts
 3085                  * are disabled so that it is not possible for the guest
 3086                  * TSC_AUX MSR value to be overwritten by the resume
 3087                  * portion of the IPI_SUSPEND codepath. This is why the
 3088                  * transition of this MSR is handled separately from those
 3089                  * handled by vmx_msr_guest_{enter,exit}(), which are ok to
 3090                  * be transitioned with preemption disabled but interrupts
 3091                  * enabled.
 3092                  *
 3093                  * These vmx_msr_guest_{enter,exit}_tsc_aux() calls can be
 3094                  * anywhere in this loop so long as they happen with
 3095                  * interrupts disabled. This location is chosen for
 3096                  * simplicity.
 3097                  */
 3098                 vmx_msr_guest_enter_tsc_aux(vmx, vcpu);
 3099 
 3100                 vmx_dr_enter_guest(vmxctx);
 3101 
 3102                 /*
 3103                  * Mark the EPT as active on this host CPU and invalidate
 3104                  * EPTP-tagged TLB entries if required.
 3105                  */
 3106                 vmx_pmap_activate(vmx, pmap);
 3107 
 3108                 vmx_run_trace(vmx, vcpu);
 3109                 rc = vmx_enter_guest(vmxctx, vmx, launched);
 3110 
 3111                 vmx_pmap_deactivate(vmx, pmap);
 3112                 vmx_dr_leave_guest(vmxctx);
 3113                 vmx_msr_guest_exit_tsc_aux(vmx, vcpu);
 3114 
 3115                 bare_lgdt(&gdtr);
 3116                 lidt(&idtr);
 3117                 lldt(ldt_sel);
 3118 
 3119                 /* Collect some information for VM exit processing */
 3120                 vmexit->rip = rip = vmcs_guest_rip();
 3121                 vmexit->inst_length = vmexit_instruction_length();
 3122                 vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
 3123                 vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
 3124 
 3125                 /* Update 'nextrip' */
 3126                 vmx->state[vcpu].nextrip = rip;
 3127 
 3128                 if (rc == VMX_GUEST_VMEXIT) {
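                              /*
                               * A VM exit due to an NMI must be handled before
                               * interrupts are re-enabled; otherwise an interrupt
                               * handler's "iret" would clear NMI blocking
                               * prematurely (see vmx_exit_handle_nmi()).
                               */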
 3129                         vmx_exit_handle_nmi(vmx, vcpu, vmexit);
 3130                         enable_intr();
 3131                         handled = vmx_exit_process(vmx, vcpu, vmexit);
 3132                 } else {
 3133                         enable_intr();
 3134                         vmx_exit_inst_error(vmxctx, rc, vmexit);
 3135                 }
 3136                 launched = 1;
 3137                 vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
 3138                 rip = vmexit->rip;
 3139         } while (handled);
 3140 
 3141         /*
 3142          * If a VM exit has been handled then the exitcode must be BOGUS;
 3143          * if a VM exit has not been handled then the exitcode must not be BOGUS.
 3144          */
 3145         if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
 3146             (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
 3147                 panic("Mismatch between handled (%d) and exitcode (%d)",
 3148                       handled, vmexit->exitcode);
 3149         }
 3150 
 3151         if (!handled)
 3152                 vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
 3153 
 3154         VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
 3155             vmexit->exitcode);
 3156 
 3157         VMCLEAR(vmcs);
 3158         vmx_msr_guest_exit(vmx, vcpu);
 3159 
 3160         return (0);
 3161 }
 3162 
 3163 static void
 3164 vmx_vmcleanup(void *arg)
 3165 {
 3166         int i;
 3167         struct vmx *vmx = arg;
 3168         uint16_t maxcpus;
 3169 
 3170         if (apic_access_virtualization(vmx, 0))
 3171                 vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 3172 
 3173         maxcpus = vm_get_maxcpus(vmx->vm);
 3174         for (i = 0; i < maxcpus; i++)
 3175                 vpid_free(vmx->state[i].vpid);
 3176 
 3177         free(vmx, M_VMX);
 3178 
 3179         return;
 3180 }
 3181 
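      /*
       * Map a VM_REG_GUEST_* identifier to its slot in the software-switched
       * register save area.  Registers that are not switched by software
       * (e.g. %rip, %rsp, control and segment registers) live in the VMCS
       * and are handled by vmcs_getreg()/vmcs_setreg() instead.
       */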
 3182 static register_t *
 3183 vmxctx_regptr(struct vmxctx *vmxctx, int reg)
 3184 {
 3185 
 3186         switch (reg) {
 3187         case VM_REG_GUEST_RAX:
 3188                 return (&vmxctx->guest_rax);
 3189         case VM_REG_GUEST_RBX:
 3190                 return (&vmxctx->guest_rbx);
 3191         case VM_REG_GUEST_RCX:
 3192                 return (&vmxctx->guest_rcx);
 3193         case VM_REG_GUEST_RDX:
 3194                 return (&vmxctx->guest_rdx);
 3195         case VM_REG_GUEST_RSI:
 3196                 return (&vmxctx->guest_rsi);
 3197         case VM_REG_GUEST_RDI:
 3198                 return (&vmxctx->guest_rdi);
 3199         case VM_REG_GUEST_RBP:
 3200                 return (&vmxctx->guest_rbp);
 3201         case VM_REG_GUEST_R8:
 3202                 return (&vmxctx->guest_r8);
 3203         case VM_REG_GUEST_R9:
 3204                 return (&vmxctx->guest_r9);
 3205         case VM_REG_GUEST_R10:
 3206                 return (&vmxctx->guest_r10);
 3207         case VM_REG_GUEST_R11:
 3208                 return (&vmxctx->guest_r11);
 3209         case VM_REG_GUEST_R12:
 3210                 return (&vmxctx->guest_r12);
 3211         case VM_REG_GUEST_R13:
 3212                 return (&vmxctx->guest_r13);
 3213         case VM_REG_GUEST_R14:
 3214                 return (&vmxctx->guest_r14);
 3215         case VM_REG_GUEST_R15:
 3216                 return (&vmxctx->guest_r15);
 3217         case VM_REG_GUEST_CR2:
 3218                 return (&vmxctx->guest_cr2);
 3219         case VM_REG_GUEST_DR0:
 3220                 return (&vmxctx->guest_dr0);
 3221         case VM_REG_GUEST_DR1:
 3222                 return (&vmxctx->guest_dr1);
 3223         case VM_REG_GUEST_DR2:
 3224                 return (&vmxctx->guest_dr2);
 3225         case VM_REG_GUEST_DR3:
 3226                 return (&vmxctx->guest_dr3);
 3227         case VM_REG_GUEST_DR6:
 3228                 return (&vmxctx->guest_dr6);
 3229         default:
 3230                 break;
 3231         }
 3232         return (NULL);
 3233 }
 3234 
 3235 static int
 3236 vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
 3237 {
 3238         register_t *regp;
 3239 
 3240         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
 3241                 *retval = *regp;
 3242                 return (0);
 3243         } else
 3244                 return (EINVAL);
 3245 }
 3246 
 3247 static int
 3248 vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
 3249 {
 3250         register_t *regp;
 3251 
 3252         if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
 3253                 *regp = val;
 3254                 return (0);
 3255         } else
 3256                 return (EINVAL);
 3257 }
 3258 
 3259 static int
 3260 vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval)
 3261 {
 3262         uint64_t gi;
 3263         int error;
 3264 
 3265         error = vmcs_getreg(&vmx->vmcs[vcpu], running,
 3266             VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi);
 3267         *retval = (gi & HWINTR_BLOCKING) ? 1 : 0;
 3268         return (error);
 3269 }
 3270 
 3271 static int
 3272 vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val)
 3273 {
 3274         struct vmcs *vmcs;
 3275         uint64_t gi;
 3276         int error, ident;
 3277 
 3278         /*
 3279          * Forcing the vcpu into an interrupt shadow is not supported.
 3280          */
 3281         if (val) {
 3282                 error = EINVAL;
 3283                 goto done;
 3284         }
 3285 
 3286         vmcs = &vmx->vmcs[vcpu];
 3287         ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY);
 3288         error = vmcs_getreg(vmcs, running, ident, &gi);
 3289         if (error == 0) {
 3290                 gi &= ~HWINTR_BLOCKING;
 3291                 error = vmcs_setreg(vmcs, running, ident, gi);
 3292         }
 3293 done:
 3294         VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val,
 3295             error ? "failed" : "succeeded");
 3296         return (error);
 3297 }
 3298 
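      /*
       * CR0 and CR4 have read shadows in the VMCS: guest reads return the
       * shadow value even when the hypervisor forces certain bits in the
       * real control register.  Return the shadow field for a register,
       * or -1 if it has none.
       */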
 3299 static int
 3300 vmx_shadow_reg(int reg)
 3301 {
 3302         int shreg;
 3303 
 3304         shreg = -1;
 3305 
 3306         switch (reg) {
 3307         case VM_REG_GUEST_CR0:
 3308                 shreg = VMCS_CR0_SHADOW;
 3309                 break;
 3310         case VM_REG_GUEST_CR4:
 3311                 shreg = VMCS_CR4_SHADOW;
 3312                 break;
 3313         default:
 3314                 break;
 3315         }
 3316 
 3317         return (shreg);
 3318 }
 3319 
 3320 static int
 3321 vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
 3322 {
 3323         int running, hostcpu;
 3324         struct vmx *vmx = arg;
 3325 
 3326         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 3327         if (running && hostcpu != curcpu)
 3328                 panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
 3329 
 3330         if (reg == VM_REG_GUEST_INTR_SHADOW)
 3331                 return (vmx_get_intr_shadow(vmx, vcpu, running, retval));
 3332 
 3333         if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
 3334                 return (0);
 3335 
 3336         return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
 3337 }
 3338 
 3339 static int
 3340 vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
 3341 {
 3342         int error, hostcpu, running, shadow;
 3343         uint64_t ctls;
 3344         pmap_t pmap;
 3345         struct vmx *vmx = arg;
 3346 
 3347         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 3348         if (running && hostcpu != curcpu)
 3349                 panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
 3350 
 3351         if (reg == VM_REG_GUEST_INTR_SHADOW)
 3352                 return (vmx_modify_intr_shadow(vmx, vcpu, running, val));
 3353 
 3354         if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
 3355                 return (0);
 3356 
 3357         /* Do not permit user write access to VMCS fields by offset. */
 3358         if (reg < 0)
 3359                 return (EINVAL);
 3360 
 3361         error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
 3362 
 3363         if (error == 0) {
 3364                 /*
 3365                  * If the "load EFER" VM-entry control is 1 then the
 3366                  * value of EFER.LMA must be identical to "IA-32e mode guest"
 3367                  * bit in the VM-entry control.
 3368                  */
 3369                 if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
 3370                     (reg == VM_REG_GUEST_EFER)) {
 3371                         vmcs_getreg(&vmx->vmcs[vcpu], running,
 3372                                     VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
 3373                         if (val & EFER_LMA)
 3374                                 ctls |= VM_ENTRY_GUEST_LMA;
 3375                         else
 3376                                 ctls &= ~VM_ENTRY_GUEST_LMA;
 3377                         vmcs_setreg(&vmx->vmcs[vcpu], running,
 3378                                     VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
 3379                 }
 3380 
 3381                 shadow = vmx_shadow_reg(reg);
 3382                 if (shadow > 0) {
 3383                         /*
 3384                          * Store the unmodified value in the shadow
 3385                          */
 3386                         error = vmcs_setreg(&vmx->vmcs[vcpu], running,
 3387                                     VMCS_IDENT(shadow), val);
 3388                 }
 3389 
 3390                 if (reg == VM_REG_GUEST_CR3) {
 3391                         /*
 3392                          * Invalidate the guest vcpu's TLB mappings to emulate
 3393                          * the behavior of updating %cr3.
 3394                          *
 3395                          * XXX the processor retains global mappings when %cr3
 3396                          * is updated but vmx_invvpid() does not.
 3397                          */
 3398                         pmap = vmx->ctx[vcpu].pmap;
 3399                         vmx_invvpid(vmx, vcpu, pmap, running);
 3400                 }
 3401         }
 3402 
 3403         return (error);
 3404 }
 3405 
 3406 static int
 3407 vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 3408 {
 3409         int hostcpu, running;
 3410         struct vmx *vmx = arg;
 3411 
 3412         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 3413         if (running && hostcpu != curcpu)
 3414                 panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
 3415 
 3416         return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
 3417 }
 3418 
 3419 static int
 3420 vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
 3421 {
 3422         int hostcpu, running;
 3423         struct vmx *vmx = arg;
 3424 
 3425         running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
 3426         if (running && hostcpu != curcpu)
 3427                 panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
 3428 
 3429         return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
 3430 }
 3431 
 3432 static int
 3433 vmx_getcap(void *arg, int vcpu, int type, int *retval)
 3434 {
 3435         struct vmx *vmx = arg;
 3436         int vcap;
 3437         int ret;
 3438 
 3439         ret = ENOENT;
 3440 
 3441         vcap = vmx->cap[vcpu].set;
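              /*
               * 'set' is a bitmask, indexed by VM_CAP_* type, of the
               * capabilities currently enabled for this vcpu.  'ret' stays
               * ENOENT for capabilities the host does not support.
               */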
 3442 
 3443         switch (type) {
 3444         case VM_CAP_HALT_EXIT:
 3445                 if (cap_halt_exit)
 3446                         ret = 0;
 3447                 break;
 3448         case VM_CAP_PAUSE_EXIT:
 3449                 if (cap_pause_exit)
 3450                         ret = 0;
 3451                 break;
 3452         case VM_CAP_MTRAP_EXIT:
 3453                 if (cap_monitor_trap)
 3454                         ret = 0;
 3455                 break;
 3456         case VM_CAP_RDPID:
 3457                 if (cap_rdpid)
 3458                         ret = 0;
 3459                 break;
 3460         case VM_CAP_RDTSCP:
 3461                 if (cap_rdtscp)
 3462                         ret = 0;
 3463                 break;
 3464         case VM_CAP_UNRESTRICTED_GUEST:
 3465                 if (cap_unrestricted_guest)
 3466                         ret = 0;
 3467                 break;
 3468         case VM_CAP_ENABLE_INVPCID:
 3469                 if (cap_invpcid)
 3470                         ret = 0;
 3471                 break;
 3472         case VM_CAP_BPT_EXIT:
 3473                 ret = 0;
 3474                 break;
 3475         default:
 3476                 break;
 3477         }
 3478 
 3479         if (ret == 0)
 3480                 *retval = (vcap & (1 << type)) ? 1 : 0;
 3481 
 3482         return (ret);
 3483 }
 3484 
 3485 static int
 3486 vmx_setcap(void *arg, int vcpu, int type, int val)
 3487 {
 3488         struct vmx *vmx = arg;
 3489         struct vmcs *vmcs = &vmx->vmcs[vcpu];
 3490         uint32_t baseval;
 3491         uint32_t *pptr;
 3492         int error;
 3493         int flag;
 3494         int reg;
 3495         int retval;
 3496 
 3497         retval = ENOENT;
 3498         pptr = NULL;
 3499 
 3500         switch (type) {
 3501         case VM_CAP_HALT_EXIT:
 3502                 if (cap_halt_exit) {
 3503                         retval = 0;
 3504                         pptr = &vmx->cap[vcpu].proc_ctls;
 3505                         baseval = *pptr;
 3506                         flag = PROCBASED_HLT_EXITING;
 3507                         reg = VMCS_PRI_PROC_BASED_CTLS;
 3508                 }
 3509                 break;
 3510         case VM_CAP_MTRAP_EXIT:
 3511                 if (cap_monitor_trap) {
 3512                         retval = 0;
 3513                         pptr = &vmx->cap[vcpu].proc_ctls;
 3514                         baseval = *pptr;
 3515                         flag = PROCBASED_MTF;
 3516                         reg = VMCS_PRI_PROC_BASED_CTLS;
 3517                 }
 3518                 break;
 3519         case VM_CAP_PAUSE_EXIT:
 3520                 if (cap_pause_exit) {
 3521                         retval = 0;
 3522                         pptr = &vmx->cap[vcpu].proc_ctls;
 3523                         baseval = *pptr;
 3524                         flag = PROCBASED_PAUSE_EXITING;
 3525                         reg = VMCS_PRI_PROC_BASED_CTLS;
 3526                 }
 3527                 break;
 3528         case VM_CAP_RDPID:
 3529         case VM_CAP_RDTSCP:
 3530                 if (cap_rdpid || cap_rdtscp)
 3531                         /*
 3532                          * Choose not to support enabling/disabling
 3533                          * RDPID/RDTSCP via libvmmapi since, as per the
 3534                          * discussion in vmx_init(), RDPID/RDTSCP are
 3535                          * either always enabled or always disabled.
 3536                          */
 3537                         retval = EOPNOTSUPP;
 3538                 break;
 3539         case VM_CAP_UNRESTRICTED_GUEST:
 3540                 if (cap_unrestricted_guest) {
 3541                         retval = 0;
 3542                         pptr = &vmx->cap[vcpu].proc_ctls2;
 3543                         baseval = *pptr;
 3544                         flag = PROCBASED2_UNRESTRICTED_GUEST;
 3545                         reg = VMCS_SEC_PROC_BASED_CTLS;
 3546                 }
 3547                 break;
 3548         case VM_CAP_ENABLE_INVPCID:
 3549                 if (cap_invpcid) {
 3550                         retval = 0;
 3551                         pptr = &vmx->cap[vcpu].proc_ctls2;
 3552                         baseval = *pptr;
 3553                         flag = PROCBASED2_ENABLE_INVPCID;
 3554                         reg = VMCS_SEC_PROC_BASED_CTLS;
 3555                 }
 3556                 break;
 3557         case VM_CAP_BPT_EXIT:
 3558                 retval = 0;
 3559 
 3560                 /* Don't change the bitmap if we are tracing all exceptions. */
 3561                 if (vmx->cap[vcpu].exc_bitmap != 0xffffffff) {
 3562                         pptr = &vmx->cap[vcpu].exc_bitmap;
 3563                         baseval = *pptr;
 3564                         flag = (1 << IDT_BP);
 3565                         reg = VMCS_EXCEPTION_BITMAP;
 3566                 }
 3567                 break;
 3568         default:
 3569                 break;
 3570         }
 3571 
 3572         if (retval)
 3573                 return (retval);
 3574 
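              /*
               * If the capability maps to a bit in one of the cached VMCS
               * control words, set or clear that bit and write the updated
               * value back to the VMCS.
               */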
 3575         if (pptr != NULL) {
 3576                 if (val) {
 3577                         baseval |= flag;
 3578                 } else {
 3579                         baseval &= ~flag;
 3580                 }
 3581                 VMPTRLD(vmcs);
 3582                 error = vmwrite(reg, baseval);
 3583                 VMCLEAR(vmcs);
 3584 
 3585                 if (error)
 3586                         return (error);
 3587 
 3588                 /*
 3589                  * Update optional stored flags, and record
 3590                  * Update the optional stored flags and record the
 3591                  * new setting.
 3592                 *pptr = baseval;
 3593         }
 3594 
 3595         if (val) {
 3596                 vmx->cap[vcpu].set |= (1 << type);
 3597         } else {
 3598                 vmx->cap[vcpu].set &= ~(1 << type);
 3599         }
 3600 
 3601         return (0);
 3602 }
 3603 
 3604 struct vlapic_vtx {
 3605         struct vlapic   vlapic;
 3606         struct pir_desc *pir_desc;
 3607         struct vmx      *vmx;
 3608         u_int   pending_prio;
 3609 };
 3610 
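      /*
       * Map an interrupt vector to a bit representing its priority class
       * (vector bits 7:4).  For example, vector 0x56 is in priority class 5
       * and maps to bit (1 << 5).  'pending_prio' above is a bitmask of the
       * classes posted while the 'pending' bit has remained set.
       */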
 3611 #define VPR_PRIO_BIT(vpr)       (1 << ((vpr) >> 4))
 3612 
 3613 #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg)   \
 3614 do {                                                                    \
 3615         VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d",     \
 3616             level ? "level" : "edge", vector);                          \
 3617         VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]);  \
 3618         VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]);  \
 3619         VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]);  \
 3620         VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]);  \
 3621         VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
 3622 } while (0)
 3623 
 3624 /*
 3625  * vlapic->ops handlers that utilize the APICv hardware assist described in
 3626  * Chapter 29 of the Intel SDM.
 3627  */
 3628 static int
 3629 vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
 3630 {
 3631         struct vlapic_vtx *vlapic_vtx;
 3632         struct pir_desc *pir_desc;
 3633         uint64_t mask;
 3634         int idx, notify = 0;
 3635 
 3636         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3637         pir_desc = vlapic_vtx->pir_desc;
 3638 
 3639         /*
 3640          * Keep track of interrupt requests in the PIR descriptor. This is
 3641          * because the virtual APIC page pointed to by the VMCS cannot be
 3642          * modified if the vcpu is running.
 3643          */
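              /*
               * The 256 possible vectors are spread across the four 64-bit
               * words of the PIR; for example, vector 65 sets bit 1 of pir[1].
               */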
 3644         idx = vector / 64;
 3645         mask = 1UL << (vector % 64);
 3646         atomic_set_long(&pir_desc->pir[idx], mask);
 3647 
 3648         /*
 3649          * A notification is required whenever the 'pending' bit makes a
 3650          * transition from 0->1.
 3651          *
 3652          * Even if the 'pending' bit is already asserted, notification about
 3653          * the incoming interrupt may still be necessary.  For example, if a
 3654          * vCPU is HLTed with a high PPR, a low priority interrupt would cause
 3655          * the 0->1 'pending' transition with a notification, but the vCPU
 3656          * would ignore the interrupt for the time being.  The same vCPU would
 3657          * need to then be notified if a high-priority interrupt arrived which
 3658          * satisfied the PPR.
 3659          *
 3660          * The priorities of interrupts injected while 'pending' is asserted
 3661          * are tracked in a custom bitfield 'pending_prio'.  Should the
 3662          * to-be-injected interrupt exceed the priorities already present, the
 3663          * notification is sent.  The priorities recorded in 'pending_prio' are
 3664          * cleared whenever the 'pending' bit makes another 0->1 transition.
 3665          */
 3666         if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) {
 3667                 notify = 1;
 3668                 vlapic_vtx->pending_prio = 0;
 3669         } else {
 3670                 const u_int old_prio = vlapic_vtx->pending_prio;
 3671                 const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT);
 3672 
 3673                 if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) {
 3674                         atomic_set_int(&vlapic_vtx->pending_prio, prio_bit);
 3675                         notify = 1;
 3676                 }
 3677         }
 3678 
 3679         VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
 3680             level, "vmx_set_intr_ready");
 3681         return (notify);
 3682 }
 3683 
 3684 static int
 3685 vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
 3686 {
 3687         struct vlapic_vtx *vlapic_vtx;
 3688         struct pir_desc *pir_desc;
 3689         struct LAPIC *lapic;
 3690         uint64_t pending, pirval;
 3691         uint32_t ppr, vpr;
 3692         int i;
 3693 
 3694         /*
 3695          * This function is only expected to be called from the 'HLT' exit
 3696          * handler which does not care about the vector that is pending.
 3697          */
 3698         KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
 3699 
 3700         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3701         pir_desc = vlapic_vtx->pir_desc;
 3702 
 3703         pending = atomic_load_acq_long(&pir_desc->pending);
 3704         if (!pending) {
 3705                 /*
 3706                  * While a virtual interrupt may have already been
 3707                  * processed, its actual delivery may still be pending
 3708                  * on the interruptibility of the guest.  Recognize a pending
 3709                  * interrupt by reevaluating virtual interrupts
 3710                  * following Section 29.2.1 in the Intel SDM Volume 3.
 3711                  */
 3712                 struct vm_exit *vmexit;
 3713                 uint8_t rvi, ppr;
 3714 
 3715                 vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
 3716                 KASSERT(vmexit->exitcode == VM_EXITCODE_HLT,
 3717                     ("vmx_pending_intr: exitcode not 'HLT'"));
 3718                 rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT;
 3719                 lapic = vlapic->apic_page;
 3720                 ppr = lapic->ppr & APIC_TPR_INT;
 3721                 if (rvi > ppr) {
 3722                         return (1);
 3723                 }
 3724 
 3725                 return (0);
 3726         }
 3727 
 3728         /*
 3729          * If there is an interrupt pending then it will be recognized only
 3730          * if its priority is greater than the processor priority.
 3731          *
 3732          * Special case: if the processor priority is zero then any pending
 3733          * interrupt will be recognized.
 3734          */
 3735         lapic = vlapic->apic_page;
 3736         ppr = lapic->ppr & APIC_TPR_INT;
 3737         if (ppr == 0)
 3738                 return (1);
 3739 
 3740         VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
 3741             lapic->ppr);
 3742 
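              /*
               * Find the highest-priority vector posted in the PIR by scanning
               * from the top word down; its priority class (vector bits 7:4,
               * the bits selected by APIC_TPR_INT) is what gets compared
               * against the PPR.
               */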
 3743         vpr = 0;
 3744         for (i = 3; i >= 0; i--) {
 3745                 pirval = pir_desc->pir[i];
 3746                 if (pirval != 0) {
 3747                         vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT;
 3748                         break;
 3749                 }
 3750         }
 3751 
 3752         /*
 3753          * If the highest-priority pending interrupt falls short of the
 3754          * processor priority of this vCPU, ensure that 'pending_prio' does not
 3755          * have any stale bits which would preclude a higher-priority interrupt
 3756          * from incurring a notification later.
 3757          */
 3758         if (vpr <= ppr) {
 3759                 const u_int prio_bit = VPR_PRIO_BIT(vpr);
 3760                 const u_int old = vlapic_vtx->pending_prio;
 3761 
 3762                 if (old > prio_bit && (old & prio_bit) == 0) {
 3763                         vlapic_vtx->pending_prio = prio_bit;
 3764                 }
 3765                 return (0);
 3766         }
 3767         return (1);
 3768 }
 3769 
 3770 static void
 3771 vmx_intr_accepted(struct vlapic *vlapic, int vector)
 3772 {
 3773 
 3774         panic("vmx_intr_accepted: not expected to be called");
 3775 }
 3776 
 3777 static void
 3778 vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
 3779 {
 3780         struct vlapic_vtx *vlapic_vtx;
 3781         struct vmx *vmx;
 3782         struct vmcs *vmcs;
 3783         uint64_t mask, val;
 3784 
 3785         KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
 3786         KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
 3787             ("vmx_set_tmr: vcpu cannot be running"));
 3788 
 3789         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3790         vmx = vlapic_vtx->vmx;
 3791         vmcs = &vmx->vmcs[vlapic->vcpuid];
 3792         mask = 1UL << (vector % 64);
 3793 
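              /*
               * Setting a vector's bit in the EOI-exit bitmap makes a guest
               * EOI for that vector cause an EOI-induced VM exit, so a
               * level-triggered EOI can be forwarded to the virtual I/O APIC
               * (see the EXIT_REASON_VIRTUALIZED_EOI handler above).
               * Edge-triggered EOIs are virtualized entirely in hardware.
               */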
 3794         VMPTRLD(vmcs);
 3795         val = vmcs_read(VMCS_EOI_EXIT(vector));
 3796         if (level)
 3797                 val |= mask;
 3798         else
 3799                 val &= ~mask;
 3800         vmcs_write(VMCS_EOI_EXIT(vector), val);
 3801         VMCLEAR(vmcs);
 3802 }
 3803 
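      /*
       * vlapic->ops.enable_x2apic_mode handler used when only TPR shadowing
       * is available: once the guest switches its local APIC into x2APIC
       * mode, stop using the TPR shadow and instead take VM exits on CR8
       * loads and stores.
       */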
 3804 static void
 3805 vmx_enable_x2apic_mode_ts(struct vlapic *vlapic)
 3806 {
 3807         struct vmx *vmx;
 3808         struct vmcs *vmcs;
 3809         uint32_t proc_ctls;
 3810         int vcpuid;
 3811 
 3812         vcpuid = vlapic->vcpuid;
 3813         vmx = ((struct vlapic_vtx *)vlapic)->vmx;
 3814         vmcs = &vmx->vmcs[vcpuid];
 3815 
 3816         proc_ctls = vmx->cap[vcpuid].proc_ctls;
 3817         proc_ctls &= ~PROCBASED_USE_TPR_SHADOW;
 3818         proc_ctls |= PROCBASED_CR8_LOAD_EXITING;
 3819         proc_ctls |= PROCBASED_CR8_STORE_EXITING;
 3820         vmx->cap[vcpuid].proc_ctls = proc_ctls;
 3821 
 3822         VMPTRLD(vmcs);
 3823         vmcs_write(VMCS_PRI_PROC_BASED_CTLS, proc_ctls);
 3824         VMCLEAR(vmcs);
 3825 }
 3826 
 3827 static void
 3828 vmx_enable_x2apic_mode_vid(struct vlapic *vlapic)
 3829 {
 3830         struct vmx *vmx;
 3831         struct vmcs *vmcs;
 3832         uint32_t proc_ctls2;
 3833         int vcpuid, error;
 3834 
 3835         vcpuid = vlapic->vcpuid;
 3836         vmx = ((struct vlapic_vtx *)vlapic)->vmx;
 3837         vmcs = &vmx->vmcs[vcpuid];
 3838 
 3839         proc_ctls2 = vmx->cap[vcpuid].proc_ctls2;
 3840         KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0,
 3841             ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2));
 3842 
 3843         proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES;
 3844         proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE;
 3845         vmx->cap[vcpuid].proc_ctls2 = proc_ctls2;
 3846 
 3847         VMPTRLD(vmcs);
 3848         vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2);
 3849         VMCLEAR(vmcs);
 3850 
 3851         if (vlapic->vcpuid == 0) {
 3852                 /*
 3853                  * The nested page table mappings are shared by all vcpus
 3854                  * so unmap the APIC access page just once.
 3855                  */
 3856                 error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE);
 3857                 KASSERT(error == 0, ("%s: vm_unmap_mmio error %d",
 3858                     __func__, error));
 3859 
 3860                 /*
 3861                  * The MSR bitmap is shared by all vcpus so modify it only
 3862                  * once in the context of vcpu 0.
 3863                  */
 3864                 error = vmx_allow_x2apic_msrs(vmx);
 3865                 KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d",
 3866                     __func__, error));
 3867         }
 3868 }
 3869 
 3870 static void
 3871 vmx_post_intr(struct vlapic *vlapic, int hostcpu)
 3872 {
 3873 
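              /*
               * Send the posted-interrupt notification vector to the host CPU
               * running the vcpu.  If the vcpu is in VMX non-root operation
               * the processor delivers the pending PIR interrupts directly,
               * without causing a VM exit; otherwise they are picked up via
               * vmx_inject_pir() before the next VM entry.
               */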
 3874         ipi_cpu(hostcpu, pirvec);
 3875 }
 3876 
 3877 /*
 3878  * Transfer the pending interrupts in the PIR descriptor to the IRR
 3879  * in the virtual APIC page.
 3880  */
 3881 static void
 3882 vmx_inject_pir(struct vlapic *vlapic)
 3883 {
 3884         struct vlapic_vtx *vlapic_vtx;
 3885         struct pir_desc *pir_desc;
 3886         struct LAPIC *lapic;
 3887         uint64_t val, pirval;
 3888         int rvi, pirbase = -1;
 3889         uint16_t intr_status_old, intr_status_new;
 3890 
 3891         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3892         pir_desc = vlapic_vtx->pir_desc;
 3893         if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
 3894                 VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 3895                     "no posted interrupt pending");
 3896                 return;
 3897         }
 3898 
 3899         pirval = 0;
 3900         pirbase = -1;
 3901         lapic = vlapic->apic_page;
 3902 
 3903         val = atomic_readandclear_long(&pir_desc->pir[0]);
 3904         if (val != 0) {
 3905                 lapic->irr0 |= val;
 3906                 lapic->irr1 |= val >> 32;
 3907                 pirbase = 0;
 3908                 pirval = val;
 3909         }
 3910 
 3911         val = atomic_readandclear_long(&pir_desc->pir[1]);
 3912         if (val != 0) {
 3913                 lapic->irr2 |= val;
 3914                 lapic->irr3 |= val >> 32;
 3915                 pirbase = 64;
 3916                 pirval = val;
 3917         }
 3918 
 3919         val = atomic_readandclear_long(&pir_desc->pir[2]);
 3920         if (val != 0) {
 3921                 lapic->irr4 |= val;
 3922                 lapic->irr5 |= val >> 32;
 3923                 pirbase = 128;
 3924                 pirval = val;
 3925         }
 3926 
 3927         val = atomic_readandclear_long(&pir_desc->pir[3]);
 3928         if (val != 0) {
 3929                 lapic->irr6 |= val;
 3930                 lapic->irr7 |= val >> 32;
 3931                 pirbase = 192;
 3932                 pirval = val;
 3933         }
 3934 
 3935         VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
 3936 
 3937         /*
 3938          * Update RVI so the processor can evaluate pending virtual
 3939          * interrupts on VM-entry.
 3940          *
 3941          * It is possible for pirval to be 0 here, even though the
 3942          * pending bit has been set. The scenario is:
 3943          * CPU-Y is sending a posted interrupt to CPU-X, which
 3944          * is running a guest and processing posted interrupts in h/w.
 3945          * CPU-X will eventually exit and the state seen in s/w is
 3946          * the pending bit set, but no PIR bits set.
 3947          *
 3948          *      CPU-X                      CPU-Y
 3949          *   (vm running)                (host running)
 3950          *   rx posted interrupt
 3951          *   CLEAR pending bit
 3952          *                               SET PIR bit
 3953          *   READ/CLEAR PIR bits
 3954          *                               SET pending bit
 3955          *   (vm exit)
 3956          *   pending bit set, PIR 0
 3957          */
 3958         if (pirval != 0) {
 3959                 rvi = pirbase + flsl(pirval) - 1;
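                      /*
                       * The 16-bit guest interrupt status field holds RVI in
                       * its low byte and SVI in its high byte; only RVI is
                       * updated here, and only if the new value is higher
                       * than the current one.
                       */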
 3960                 intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
 3961                 intr_status_new = (intr_status_old & 0xFF00) | rvi;
 3962                 if (intr_status_new > intr_status_old) {
 3963                         vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
 3964                         VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
 3965                             "guest_intr_status changed from 0x%04x to 0x%04x",
 3966                             intr_status_old, intr_status_new);
 3967                 }
 3968         }
 3969 }
 3970 
 3971 static struct vlapic *
 3972 vmx_vlapic_init(void *arg, int vcpuid)
 3973 {
 3974         struct vmx *vmx;
 3975         struct vlapic *vlapic;
 3976         struct vlapic_vtx *vlapic_vtx;
 3977 
 3978         vmx = arg;
 3979 
 3980         vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
 3981         vlapic->vm = vmx->vm;
 3982         vlapic->vcpuid = vcpuid;
 3983         vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
 3984 
 3985         vlapic_vtx = (struct vlapic_vtx *)vlapic;
 3986         vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
 3987         vlapic_vtx->vmx = vmx;
 3988 
 3989         if (tpr_shadowing) {
 3990                 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts;
 3991         }
 3992 
 3993         if (virtual_interrupt_delivery) {
 3994                 vlapic->ops.set_intr_ready = vmx_set_intr_ready;
 3995                 vlapic->ops.pending_intr = vmx_pending_intr;
 3996                 vlapic->ops.intr_accepted = vmx_intr_accepted;
 3997                 vlapic->ops.set_tmr = vmx_set_tmr;
 3998                 vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid;
 3999         }
 4000 
 4001         if (posted_interrupts)
 4002                 vlapic->ops.post_intr = vmx_post_intr;
 4003 
 4004         vlapic_init(vlapic);
 4005 
 4006         return (vlapic);
 4007 }
 4008 
 4009 static void
 4010 vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
 4011 {
 4012 
 4013         vlapic_cleanup(vlapic);
 4014         free(vlapic, M_VLAPIC);
 4015 }
 4016 
 4017 struct vmm_ops vmm_ops_intel = {
 4018         .init           = vmx_init,
 4019         .cleanup        = vmx_cleanup,
 4020         .resume         = vmx_restore,
 4021         .vminit         = vmx_vminit,
 4022         .vmrun          = vmx_run,
 4023         .vmcleanup      = vmx_vmcleanup,
 4024         .vmgetreg       = vmx_getreg,
 4025         .vmsetreg       = vmx_setreg,
 4026         .vmgetdesc      = vmx_getdesc,
 4027         .vmsetdesc      = vmx_setdesc,
 4028         .vmgetcap       = vmx_getcap,
 4029         .vmsetcap       = vmx_setcap,
 4030         .vmspace_alloc  = ept_vmspace_alloc,
 4031         .vmspace_free   = ept_vmspace_free,
 4032         .vlapic_init    = vmx_vlapic_init,
 4033         .vlapic_cleanup = vmx_vlapic_cleanup,
 4034 };
