FreeBSD/Linux Kernel Cross Reference
sys/amd64/vmm/amd/svm.c

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com)
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice unmodified, this list of conditions, and the following
   12  *    disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
   18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
   20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
   21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
   26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD$");
   31 
   32 #include "opt_bhyve_snapshot.h"
   33 
   34 #include <sys/param.h>
   35 #include <sys/systm.h>
   36 #include <sys/smp.h>
   37 #include <sys/kernel.h>
   38 #include <sys/malloc.h>
   39 #include <sys/pcpu.h>
   40 #include <sys/proc.h>
   41 #include <sys/reg.h>
   42 #include <sys/smr.h>
   43 #include <sys/sysctl.h>
   44 
   45 #include <vm/vm.h>
   46 #include <vm/pmap.h>
   47 
   48 #include <machine/cpufunc.h>
   49 #include <machine/psl.h>
   50 #include <machine/md_var.h>
   51 #include <machine/specialreg.h>
   52 #include <machine/smp.h>
   53 #include <machine/vmm.h>
   54 #include <machine/vmm_dev.h>
   55 #include <machine/vmm_instruction_emul.h>
   56 #include <machine/vmm_snapshot.h>
   57 
   58 #include "vmm_lapic.h"
   59 #include "vmm_stat.h"
   60 #include "vmm_ktr.h"
   61 #include "vmm_ioport.h"
   62 #include "vatpic.h"
   63 #include "vlapic.h"
   64 #include "vlapic_priv.h"
   65 
   66 #include "x86.h"
   67 #include "vmcb.h"
   68 #include "svm.h"
   69 #include "svm_softc.h"
   70 #include "svm_msr.h"
   71 #include "npt.h"
   72 
   73 SYSCTL_DECL(_hw_vmm);
   74 SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
   75     NULL);
   76 
   77 /*
   78  * SVM CPUID function 0x8000_000A, edx bit decoding.
   79  */
   80 #define AMD_CPUID_SVM_NP                BIT(0)  /* Nested paging or RVI */
   81 #define AMD_CPUID_SVM_LBR               BIT(1)  /* Last branch virtualization */
   82 #define AMD_CPUID_SVM_SVML              BIT(2)  /* SVM lock */
   83 #define AMD_CPUID_SVM_NRIP_SAVE         BIT(3)  /* Next RIP is saved */
   84 #define AMD_CPUID_SVM_TSC_RATE          BIT(4)  /* TSC rate control. */
   85 #define AMD_CPUID_SVM_VMCB_CLEAN        BIT(5)  /* VMCB state caching */
   86 #define AMD_CPUID_SVM_FLUSH_BY_ASID     BIT(6)  /* Flush by ASID */
   87 #define AMD_CPUID_SVM_DECODE_ASSIST     BIT(7)  /* Decode assist */
   88 #define AMD_CPUID_SVM_PAUSE_INC         BIT(10) /* Pause intercept filter. */
   89 #define AMD_CPUID_SVM_PAUSE_FTH         BIT(12) /* Pause filter threshold */
   90 #define AMD_CPUID_SVM_AVIC              BIT(13) /* AVIC present */
   91 
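      /*
       * These flags come from CPUID leaf 0x8000_000A, which reports the SVM
       * revision in %eax, the number of supported ASIDs in %ebx and the
       * feature bits above in %edx.  check_svm_features() below caches %edx
       * in 'svm_feature' (also visible as the hw.vmm.svm.features tunable)
       * and %ebx in 'nasid'; helpers such as flush_by_asid() and
       * decode_assist() then test individual bits of the cached value.
       */
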
   92 #define VMCB_CACHE_DEFAULT      (VMCB_CACHE_ASID        |       \
   93                                 VMCB_CACHE_IOPM         |       \
   94                                 VMCB_CACHE_I            |       \
   95                                 VMCB_CACHE_TPR          |       \
   96                                 VMCB_CACHE_CR2          |       \
   97                                 VMCB_CACHE_CR           |       \
   98                                 VMCB_CACHE_DR           |       \
   99                                 VMCB_CACHE_DT           |       \
  100                                 VMCB_CACHE_SEG          |       \
  101                                 VMCB_CACHE_NP)
  102 
  103 static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT;
  104 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean,
  105     0, NULL);
  106 
  107 static MALLOC_DEFINE(M_SVM, "svm", "svm");
  108 static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic");
  109 
  110 static uint32_t svm_feature = ~0U;      /* AMD SVM features. */
  111 SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0,
  112     "SVM features advertised by CPUID.8000000AH:EDX");
  113 
  114 static int disable_npf_assist;
  115 SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN,
  116     &disable_npf_assist, 0, NULL);
  117 
  118 /* Maximum ASIDs supported by the processor */
  119 static uint32_t nasid;
  120 SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0,
  121     "Number of ASIDs supported by this processor");
  122 
  123 /* Current ASID generation for each host cpu */
  124 static struct asid asid[MAXCPU];
  125 
  126 /* 
  127  * SVM host state saved area of size 4KB for each core.
  128  */
  129 static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
  130 
  131 static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery");
  132 static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry");
  133 static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window");
  134 
  135 static int svm_getdesc(void *vcpui, int reg, struct seg_desc *desc);
  136 static int svm_setreg(void *vcpui, int ident, uint64_t val);
  137 
  138 static __inline int
  139 flush_by_asid(void)
  140 {
  141 
  142         return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID);
  143 }
  144 
  145 static __inline int
  146 decode_assist(void)
  147 {
  148 
  149         return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST);
  150 }
  151 
  152 static void
  153 svm_disable(void *arg __unused)
  154 {
  155         uint64_t efer;
  156 
  157         efer = rdmsr(MSR_EFER);
  158         efer &= ~EFER_SVM;
  159         wrmsr(MSR_EFER, efer);
  160 }
  161 
  162 /*
  163  * Disable SVM on all CPUs.
  164  */
  165 static int
  166 svm_modcleanup(void)
  167 {
  168 
  169         smp_rendezvous(NULL, svm_disable, NULL, NULL);
  170         return (0);
  171 }
  172 
  173 /*
  174  * Verify that all the features required by bhyve are available.
  175  */
  176 static int
  177 check_svm_features(void)
  178 {
  179         u_int regs[4];
  180 
  181         /* CPUID Fn8000_000A is for SVM */
  182         do_cpuid(0x8000000A, regs);
  183         svm_feature &= regs[3];
  184 
  185         /*
  186          * The number of ASIDs can be configured to be less than what is
  187          * supported by the hardware but not more.
  188          */
  189         if (nasid == 0 || nasid > regs[1])
  190                 nasid = regs[1];
  191         KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid));
  192 
  193         /* bhyve requires the Nested Paging feature */
  194         if (!(svm_feature & AMD_CPUID_SVM_NP)) {
  195                 printf("SVM: Nested Paging feature not available.\n");
  196                 return (ENXIO);
  197         }
  198 
  199         /* bhyve requires the NRIP Save feature */
  200         if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) {
  201                 printf("SVM: NRIP Save feature not available.\n");
  202                 return (ENXIO);
  203         }
  204 
  205         return (0);
  206 }
  207 
  208 static void
  209 svm_enable(void *arg __unused)
  210 {
  211         uint64_t efer;
  212 
  213         efer = rdmsr(MSR_EFER);
  214         efer |= EFER_SVM;
  215         wrmsr(MSR_EFER, efer);
  216 
  217         wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));
  218 }
  219 
  220 /*
  221  * Return 1 if SVM is enabled on this processor and 0 otherwise.
  222  */
  223 static int
  224 svm_available(void)
  225 {
  226         uint64_t msr;
  227 
  228         /* Section 15.4 Enabling SVM from APM2. */
  229         if ((amd_feature2 & AMDID2_SVM) == 0) {
  230                 printf("SVM: not available.\n");
  231                 return (0);
  232         }
  233 
  234         msr = rdmsr(MSR_VM_CR);
  235         if ((msr & VM_CR_SVMDIS) != 0) {
  236                 printf("SVM: disabled by BIOS.\n");
  237                 return (0);
  238         }
  239 
  240         return (1);
  241 }
  242 
  243 static int
  244 svm_modinit(int ipinum)
  245 {
  246         int error, cpu;
  247 
  248         if (!svm_available())
  249                 return (ENXIO);
  250 
  251         error = check_svm_features();
  252         if (error)
  253                 return (error);
  254 
  255         vmcb_clean &= VMCB_CACHE_DEFAULT;
  256 
  257         for (cpu = 0; cpu < MAXCPU; cpu++) {
  258                 /*
  259                  * Initialize the host ASIDs to their "highest" valid values.
  260                  *
  261                  * The next ASID allocation will rollover both 'gen' and 'num'
  262                  * and start off the sequence at {1,1}.
  263                  */
  264                 asid[cpu].gen = ~0UL;
  265                 asid[cpu].num = nasid - 1;
  266         }
  267 
  268         svm_msr_init();
  269         svm_npt_init(ipinum);
  270 
  271         /* Enable SVM on all CPUs */
  272         smp_rendezvous(NULL, svm_enable, NULL, NULL);
  273 
  274         return (0);
  275 }
  276 
  277 static void
  278 svm_modresume(void)
  279 {
  280 
  281         svm_enable(NULL);
  282 }               
  283 
  284 #ifdef BHYVE_SNAPSHOT
  285 void
  286 svm_set_tsc_offset(struct svm_vcpu *vcpu, uint64_t offset)
  287 {
  288         struct vmcb_ctrl *ctrl;
  289 
  290         ctrl = svm_get_vmcb_ctrl(vcpu);
  291         ctrl->tsc_offset = offset;
  292 
  293         svm_set_dirty(vcpu, VMCB_CACHE_I);
  294         SVM_CTR1(vcpu, "tsc offset changed to %#lx", offset);
  295 
  296         vm_set_tsc_offset(vcpu->vcpu, offset);
  297 }
  298 #endif
  299 
  300 /* Pentium compatible MSRs */
  301 #define MSR_PENTIUM_START       0       
  302 #define MSR_PENTIUM_END         0x1FFF
  303 /* AMD 6th generation and Intel compatible MSRs */
  304 #define MSR_AMD6TH_START        0xC0000000UL    
  305 #define MSR_AMD6TH_END          0xC0001FFFUL    
  306 /* AMD 7th and 8th generation compatible MSRs */
  307 #define MSR_AMD7TH_START        0xC0010000UL    
  308 #define MSR_AMD7TH_END          0xC0011FFFUL    
  309 
  310 /*
   311  * Get the index and bit position for an MSR in the permission bitmap.
  312  * Two bits are used for each MSR: lower bit for read and higher bit for write.
  313  */
  314 static int
  315 svm_msr_index(uint64_t msr, int *index, int *bit)
  316 {
  317         uint32_t base, off;
  318 
  319         *index = -1;
  320         *bit = (msr % 4) * 2;
  321         base = 0;
  322 
  323         if (msr >= MSR_PENTIUM_START && msr <= MSR_PENTIUM_END) {
  324                 *index = msr / 4;
  325                 return (0);
  326         }
  327 
  328         base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); 
  329         if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) {
  330                 off = (msr - MSR_AMD6TH_START); 
  331                 *index = (off + base) / 4;
  332                 return (0);
  333         } 
  334 
  335         base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1);
  336         if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) {
  337                 off = (msr - MSR_AMD7TH_START);
  338                 *index = (off + base) / 4;
  339                 return (0);
  340         }
  341 
  342         return (EINVAL);
  343 }
  344 
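      /*
       * A worked example of the mapping above: MSR_EFER (0xc0000080) falls in
       * the AMD 6th generation range, so
       *
       *      bit   = (0xc0000080 % 4) * 2                     = 0
       *      index = ((0xc0000080 - 0xc0000000) + 0x2000) / 4 = 0x820
       *
       * i.e. bit 0 of perm_bitmap[0x820] controls reads of EFER and bit 1
       * controls writes.  The bitmap starts out all ones ("intercept
       * everything", see svm_init()) and svm_msr_perm() clears bits to grant
       * direct guest access.
       */
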
  345 /*
  346  * Allow vcpu to read or write the 'msr' without trapping into the hypervisor.
  347  */
  348 static void
  349 svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write)
  350 {
  351         int index, bit, error __diagused;
  352 
  353         error = svm_msr_index(msr, &index, &bit);
  354         KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr));
  355         KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE,
  356             ("%s: invalid index %d for msr %#lx", __func__, index, msr));
  357         KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d "
  358             "msr %#lx", __func__, bit, msr));
  359 
  360         if (read)
  361                 perm_bitmap[index] &= ~(1UL << bit);
  362 
  363         if (write)
  364                 perm_bitmap[index] &= ~(2UL << bit);
  365 }
  366 
  367 static void
  368 svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr)
  369 {
  370 
  371         svm_msr_perm(perm_bitmap, msr, true, true);
  372 }
  373 
  374 static void
  375 svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr)
  376 {
  377 
  378         svm_msr_perm(perm_bitmap, msr, true, false);
  379 }
  380 
  381 static __inline int
  382 svm_get_intercept(struct svm_vcpu *vcpu, int idx, uint32_t bitmask)
  383 {
  384         struct vmcb_ctrl *ctrl;
  385 
  386         KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
  387 
  388         ctrl = svm_get_vmcb_ctrl(vcpu);
  389         return (ctrl->intercept[idx] & bitmask ? 1 : 0);
  390 }
  391 
  392 static __inline void
  393 svm_set_intercept(struct svm_vcpu *vcpu, int idx, uint32_t bitmask, int enabled)
  394 {
  395         struct vmcb_ctrl *ctrl;
  396         uint32_t oldval;
  397 
  398         KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx));
  399 
  400         ctrl = svm_get_vmcb_ctrl(vcpu);
  401         oldval = ctrl->intercept[idx];
  402 
  403         if (enabled)
  404                 ctrl->intercept[idx] |= bitmask;
  405         else
  406                 ctrl->intercept[idx] &= ~bitmask;
  407 
  408         if (ctrl->intercept[idx] != oldval) {
  409                 svm_set_dirty(vcpu, VMCB_CACHE_I);
  410                 SVM_CTR3(vcpu, "intercept[%d] modified from %#x to %#x", idx,
  411                     oldval, ctrl->intercept[idx]);
  412         }
  413 }
  414 
  415 static __inline void
  416 svm_disable_intercept(struct svm_vcpu *vcpu, int off, uint32_t bitmask)
  417 {
  418 
  419         svm_set_intercept(vcpu, off, bitmask, 0);
  420 }
  421 
  422 static __inline void
  423 svm_enable_intercept(struct svm_vcpu *vcpu, int off, uint32_t bitmask)
  424 {
  425 
  426         svm_set_intercept(vcpu, off, bitmask, 1);
  427 }
  428 
  429 static void
  430 vmcb_init(struct svm_softc *sc, struct svm_vcpu *vcpu, uint64_t iopm_base_pa,
  431     uint64_t msrpm_base_pa, uint64_t np_pml4)
  432 {
  433         struct vmcb_ctrl *ctrl;
  434         struct vmcb_state *state;
  435         uint32_t mask;
  436         int n;
  437 
  438         ctrl = svm_get_vmcb_ctrl(vcpu);
  439         state = svm_get_vmcb_state(vcpu);
  440 
  441         ctrl->iopm_base_pa = iopm_base_pa;
  442         ctrl->msrpm_base_pa = msrpm_base_pa;
  443 
  444         /* Enable nested paging */
  445         ctrl->np_enable = 1;
  446         ctrl->n_cr3 = np_pml4;
  447 
  448         /*
  449          * Intercept accesses to the control registers that are not shadowed
  450          * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8.
  451          */
  452         for (n = 0; n < 16; n++) {
  453                 mask = (BIT(n) << 16) | BIT(n);
  454                 if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8)
  455                         svm_disable_intercept(vcpu, VMCB_CR_INTCPT, mask);
  456                 else
  457                         svm_enable_intercept(vcpu, VMCB_CR_INTCPT, mask);
  458         }
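
              /*
               * The low half of each 32-bit intercept word covers reads of
               * %cr0-%cr15 and the high half covers writes (per the VMCB
               * layout in the APM), so the mask above toggles both at once.
               * For example, n == 4 gives mask == 0x00100010, the read and
               * write intercepts for %cr4.
               */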
  459 
  460         /*
   461          * Intercept everything when tracing guest exceptions, otherwise
   462          * just intercept the machine check exception.
  463          */
  464         if (vcpu_trace_exceptions(vcpu->vcpu)) {
  465                 for (n = 0; n < 32; n++) {
  466                         /*
  467                          * Skip unimplemented vectors in the exception bitmap.
  468                          */
  469                         if (n == 2 || n == 9) {
  470                                 continue;
  471                         }
  472                         svm_enable_intercept(vcpu, VMCB_EXC_INTCPT, BIT(n));
  473                 }
  474         } else {
  475                 svm_enable_intercept(vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC));
  476         }
  477 
   478         /* Intercept various events (e.g. I/O, MSR and CPUID accesses) */
  479         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO);
  480         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR);
  481         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID);
  482         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR);
  483         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT);
  484         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI);
  485         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI);
  486         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN);
  487         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_FERR_FREEZE);
  488         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVD);
  489         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INVLPGA);
  490 
  491         svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR);
  492         svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT);
  493 
  494         /*
   495          * Intercept the SVM instructions, since the hardware otherwise makes
   496          * them available to guests.  VMMCALL is skipped: unintercepted, it raises #UD.
  497          */
  498         svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMLOAD);
  499         svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMSAVE);
  500         svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_STGI);
  501         svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_CLGI);
  502         svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_SKINIT);
  503         svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_ICEBP);
  504         if (vcpu_trap_wbinvd(vcpu->vcpu)) {
  505                 svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT,
  506                     VMCB_INTCPT_WBINVD);
  507         }
  508 
  509         /*
  510          * From section "Canonicalization and Consistency Checks" in APMv2
  511          * the VMRUN intercept bit must be set to pass the consistency check.
  512          */
  513         svm_enable_intercept(vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN);
  514 
  515         /*
  516          * The ASID will be set to a non-zero value just before VMRUN.
  517          */
  518         ctrl->asid = 0;
  519 
  520         /*
  521          * Section 15.21.1, Interrupt Masking in EFLAGS
  522          * Section 15.21.2, Virtualizing APIC.TPR
  523          *
   524          * This must be set for %rflags and %cr8 isolation of guest and host.
  525          */
  526         ctrl->v_intr_masking = 1;
  527 
  528         /* Enable Last Branch Record aka LBR for debugging */
  529         ctrl->lbr_virt_en = 1;
  530         state->dbgctl = BIT(0);
  531 
  532         /* EFER_SVM must always be set when the guest is executing */
  533         state->efer = EFER_SVM;
  534 
  535         /* Set up the PAT to power-on state */
  536         state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK)     |
  537             PAT_VALUE(1, PAT_WRITE_THROUGH)     |
  538             PAT_VALUE(2, PAT_UNCACHED)          |
  539             PAT_VALUE(3, PAT_UNCACHEABLE)       |
  540             PAT_VALUE(4, PAT_WRITE_BACK)        |
  541             PAT_VALUE(5, PAT_WRITE_THROUGH)     |
  542             PAT_VALUE(6, PAT_UNCACHED)          |
  543             PAT_VALUE(7, PAT_UNCACHEABLE);
  544 
  545         /* Set up DR6/7 to power-on state */
  546         state->dr6 = DBREG_DR6_RESERVED1;
  547         state->dr7 = DBREG_DR7_RESERVED1;
  548 }
  549 
  550 /*
  551  * Initialize a virtual machine.
  552  */
  553 static void *
  554 svm_init(struct vm *vm, pmap_t pmap)
  555 {
  556         struct svm_softc *svm_sc;
  557 
  558         svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO);
  559 
  560         svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM,
  561             M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
  562         if (svm_sc->msr_bitmap == NULL)
  563                 panic("contigmalloc of SVM MSR bitmap failed");
  564         svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM,
  565             M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0);
  566         if (svm_sc->iopm_bitmap == NULL)
  567                 panic("contigmalloc of SVM IO bitmap failed");
  568 
  569         svm_sc->vm = vm;
  570         svm_sc->nptp = vtophys(pmap->pm_pmltop);
  571 
  572         /*
  573          * Intercept read and write accesses to all MSRs.
  574          */
  575         memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE);
  576 
  577         /*
  578          * Access to the following MSRs is redirected to the VMCB when the
  579          * guest is executing. Therefore it is safe to allow the guest to
  580          * read/write these MSRs directly without hypervisor involvement.
  581          */
  582         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE);
  583         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE);
  584         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE);
  585 
  586         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR);
  587         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR);
  588         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR);
  589         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK);
  590         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR);
  591         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR);
  592         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR);
  593         svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT);
  594 
  595         svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC);
  596 
  597         /*
  598          * Intercept writes to make sure that the EFER_SVM bit is not cleared.
  599          */
  600         svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER);
  601 
  602         /* Intercept access to all I/O ports. */
  603         memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE);
  604 
  605         return (svm_sc);
  606 }
  607 
  608 static void *
  609 svm_vcpu_init(void *vmi, struct vcpu *vcpu1, int vcpuid)
  610 {
  611         struct svm_softc *sc = vmi;
  612         struct svm_vcpu *vcpu;
  613 
  614         vcpu = malloc(sizeof(*vcpu), M_SVM, M_WAITOK | M_ZERO);
  615         vcpu->sc = sc;
  616         vcpu->vcpu = vcpu1;
  617         vcpu->vcpuid = vcpuid;
  618         vcpu->vmcb = malloc_aligned(sizeof(struct vmcb), PAGE_SIZE, M_SVM,
  619             M_WAITOK | M_ZERO);
  620         vcpu->nextrip = ~0;
  621         vcpu->lastcpu = NOCPU;
  622         vcpu->vmcb_pa = vtophys(vcpu->vmcb);
  623         vmcb_init(sc, vcpu, vtophys(sc->iopm_bitmap), vtophys(sc->msr_bitmap),
  624             sc->nptp);
  625         svm_msr_guest_init(sc, vcpu);
  626         return (vcpu);
  627 }
  628 
  629 /*
  630  * Collateral for a generic SVM VM-exit.
  631  */
  632 static void
  633 vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2)
  634 {
  635 
  636         vme->exitcode = VM_EXITCODE_SVM;
  637         vme->u.svm.exitcode = code;
  638         vme->u.svm.exitinfo1 = info1;
  639         vme->u.svm.exitinfo2 = info2;
  640 }
  641 
  642 static int
  643 svm_cpl(struct vmcb_state *state)
  644 {
  645 
  646         /*
  647          * From APMv2:
  648          *   "Retrieve the CPL from the CPL field in the VMCB, not
  649          *    from any segment DPL"
  650          */
  651         return (state->cpl);
  652 }
  653 
  654 static enum vm_cpu_mode
  655 svm_vcpu_mode(struct vmcb *vmcb)
  656 {
  657         struct vmcb_segment seg;
  658         struct vmcb_state *state;
  659         int error __diagused;
  660 
  661         state = &vmcb->state;
  662 
  663         if (state->efer & EFER_LMA) {
  664                 error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
  665                 KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__,
  666                     error));
  667 
  668                 /*
   669                  * Section 4.8.1 of APM2: check if the Code Segment has
   670                  * the Long attribute set in its descriptor.
  671                  */
  672                 if (seg.attrib & VMCB_CS_ATTRIB_L)
  673                         return (CPU_MODE_64BIT);
  674                 else
  675                         return (CPU_MODE_COMPATIBILITY);
  676         } else  if (state->cr0 & CR0_PE) {
  677                 return (CPU_MODE_PROTECTED);
  678         } else {
  679                 return (CPU_MODE_REAL);
  680         }
  681 }
  682 
  683 static enum vm_paging_mode
  684 svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
  685 {
  686 
  687         if ((cr0 & CR0_PG) == 0)
  688                 return (PAGING_MODE_FLAT);
  689         if ((cr4 & CR4_PAE) == 0)
  690                 return (PAGING_MODE_32);
  691         if (efer & EFER_LME)
  692                 return (PAGING_MODE_64);
  693         else
  694                 return (PAGING_MODE_PAE);
  695 }
  696 
  697 /*
  698  * ins/outs utility routines
  699  */
  700 static uint64_t
  701 svm_inout_str_index(struct svm_regctx *regs, int in)
  702 {
  703         uint64_t val;
  704 
  705         val = in ? regs->sctx_rdi : regs->sctx_rsi;
  706 
  707         return (val);
  708 }
  709 
  710 static uint64_t
  711 svm_inout_str_count(struct svm_regctx *regs, int rep)
  712 {
  713         uint64_t val;
  714 
  715         val = rep ? regs->sctx_rcx : 1;
  716 
  717         return (val);
  718 }
  719 
  720 static void
  721 svm_inout_str_seginfo(struct svm_vcpu *vcpu, int64_t info1, int in,
  722     struct vm_inout_str *vis)
  723 {
  724         int error __diagused, s;
  725 
  726         if (in) {
  727                 vis->seg_name = VM_REG_GUEST_ES;
  728         } else {
  729                 /* The segment field has standard encoding */
  730                 s = (info1 >> 10) & 0x7;
  731                 vis->seg_name = vm_segment_name(s);
  732         }
  733 
  734         error = svm_getdesc(vcpu, vis->seg_name, &vis->seg_desc);
  735         KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error));
  736 }
  737 
  738 static int
  739 svm_inout_str_addrsize(uint64_t info1)
  740 {
  741         uint32_t size;
  742 
  743         size = (info1 >> 7) & 0x7;
  744         switch (size) {
  745         case 1:
  746                 return (2);     /* 16 bit */
  747         case 2:
  748                 return (4);     /* 32 bit */
  749         case 4:
  750                 return (8);     /* 64 bit */
  751         default:
  752                 panic("%s: invalid size encoding %d", __func__, size);
  753         }
  754 }
  755 
  756 static void
  757 svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging)
  758 {
  759         struct vmcb_state *state;
  760 
  761         state = &vmcb->state;
  762         paging->cr3 = state->cr3;
  763         paging->cpl = svm_cpl(state);
  764         paging->cpu_mode = svm_vcpu_mode(vmcb);
  765         paging->paging_mode = svm_paging_mode(state->cr0, state->cr4,
  766             state->efer);
  767 }
  768 
  769 #define UNHANDLED 0
  770 
  771 /*
  772  * Handle guest I/O intercept.
  773  */
  774 static int
  775 svm_handle_io(struct svm_vcpu *vcpu, struct vm_exit *vmexit)
  776 {
  777         struct vmcb_ctrl *ctrl;
  778         struct vmcb_state *state;
  779         struct svm_regctx *regs;
  780         struct vm_inout_str *vis;
  781         uint64_t info1;
  782         int inout_string;
  783 
  784         state = svm_get_vmcb_state(vcpu);
  785         ctrl  = svm_get_vmcb_ctrl(vcpu);
  786         regs  = svm_get_guest_regctx(vcpu);
  787 
  788         info1 = ctrl->exitinfo1;
  789         inout_string = info1 & BIT(2) ? 1 : 0;
  790 
  791         /*
  792          * The effective segment number in EXITINFO1[12:10] is populated
  793          * only if the processor has the DecodeAssist capability.
  794          *
  795          * XXX this is not specified explicitly in APMv2 but can be verified
  796          * empirically.
  797          */
  798         if (inout_string && !decode_assist())
  799                 return (UNHANDLED);
  800 
  801         vmexit->exitcode        = VM_EXITCODE_INOUT;
  802         vmexit->u.inout.in      = (info1 & BIT(0)) ? 1 : 0;
  803         vmexit->u.inout.string  = inout_string;
  804         vmexit->u.inout.rep     = (info1 & BIT(3)) ? 1 : 0;
  805         vmexit->u.inout.bytes   = (info1 >> 4) & 0x7;
  806         vmexit->u.inout.port    = (uint16_t)(info1 >> 16);
  807         vmexit->u.inout.eax     = (uint32_t)(state->rax);
  808 
  809         if (inout_string) {
  810                 vmexit->exitcode = VM_EXITCODE_INOUT_STR;
  811                 vis = &vmexit->u.inout_str;
  812                 svm_paging_info(svm_get_vmcb(vcpu), &vis->paging);
  813                 vis->rflags = state->rflags;
  814                 vis->cr0 = state->cr0;
  815                 vis->index = svm_inout_str_index(regs, vmexit->u.inout.in);
  816                 vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep);
  817                 vis->addrsize = svm_inout_str_addrsize(info1);
  818                 svm_inout_str_seginfo(vcpu, info1, vmexit->u.inout.in, vis);
  819         }
  820 
  821         return (UNHANDLED);
  822 }
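
      /*
       * For reference, the EXITINFO1 fields decoded above are: bit 0 the
       * direction (set for "in"), bit 2 the string-operation flag, bit 3 the
       * REP prefix, bits 6:4 the operand size, bits 9:7 the address-size
       * encoding, bits 12:10 the effective segment (valid only with decode
       * assist) and bits 31:16 the port.  For example, info1 == 0x03f80011
       * decodes as a one-byte, non-string "in" from port 0x3f8.
       */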
  823 
  824 static int
  825 npf_fault_type(uint64_t exitinfo1)
  826 {
  827 
  828         if (exitinfo1 & VMCB_NPF_INFO1_W)
  829                 return (VM_PROT_WRITE);
  830         else if (exitinfo1 & VMCB_NPF_INFO1_ID)
  831                 return (VM_PROT_EXECUTE);
  832         else
  833                 return (VM_PROT_READ);
  834 }
  835 
  836 static bool
  837 svm_npf_emul_fault(uint64_t exitinfo1)
  838 {
  839 
  840         if (exitinfo1 & VMCB_NPF_INFO1_ID) {
  841                 return (false);
  842         }
  843 
  844         if (exitinfo1 & VMCB_NPF_INFO1_GPT) {
  845                 return (false);
  846         }
  847 
  848         if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) {
  849                 return (false);
  850         }
  851 
  852         return (true);  
  853 }
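
      /*
       * In other words, a nested page fault is a candidate for instruction
       * emulation only if it was a plain data access rather than an
       * instruction fetch (VMCB_NPF_INFO1_ID), did not occur while walking
       * the guest's own page tables (VMCB_NPF_INFO1_GPT), and the faulting
       * guest physical address was recorded (VMCB_NPF_INFO1_GPA).
       * svm_vmexit() routes such faults to svm_handle_inst_emul() below.
       */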
  854 
  855 static void
  856 svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit)
  857 {
  858         struct vm_guest_paging *paging;
  859         struct vmcb_segment seg;
  860         struct vmcb_ctrl *ctrl;
  861         char *inst_bytes;
  862         int error __diagused, inst_len;
  863 
  864         ctrl = &vmcb->ctrl;
  865         paging = &vmexit->u.inst_emul.paging;
  866 
  867         vmexit->exitcode = VM_EXITCODE_INST_EMUL;
  868         vmexit->u.inst_emul.gpa = gpa;
  869         vmexit->u.inst_emul.gla = VIE_INVALID_GLA;
  870         svm_paging_info(vmcb, paging);
  871 
  872         error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg);
  873         KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error));
  874 
  875         switch(paging->cpu_mode) {
  876         case CPU_MODE_REAL:
  877                 vmexit->u.inst_emul.cs_base = seg.base;
  878                 vmexit->u.inst_emul.cs_d = 0;
  879                 break;
  880         case CPU_MODE_PROTECTED:
  881         case CPU_MODE_COMPATIBILITY:
  882                 vmexit->u.inst_emul.cs_base = seg.base;
  883 
  884                 /*
  885                  * Section 4.8.1 of APM2, Default Operand Size or D bit.
  886                  */
  887                 vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ?
  888                     1 : 0;
  889                 break;
  890         default:
  891                 vmexit->u.inst_emul.cs_base = 0;
  892                 vmexit->u.inst_emul.cs_d = 0;
  893                 break;  
  894         }
  895 
  896         /*
  897          * Copy the instruction bytes into 'vie' if available.
  898          */
  899         if (decode_assist() && !disable_npf_assist) {
  900                 inst_len = ctrl->inst_len;
  901                 inst_bytes = ctrl->inst_bytes;
  902         } else {
  903                 inst_len = 0;
  904                 inst_bytes = NULL;
  905         }
  906         vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len);
  907 }
  908 
  909 #ifdef KTR
  910 static const char *
  911 intrtype_to_str(int intr_type)
  912 {
  913         switch (intr_type) {
  914         case VMCB_EVENTINJ_TYPE_INTR:
  915                 return ("hwintr");
  916         case VMCB_EVENTINJ_TYPE_NMI:
  917                 return ("nmi");
  918         case VMCB_EVENTINJ_TYPE_INTn:
  919                 return ("swintr");
  920         case VMCB_EVENTINJ_TYPE_EXCEPTION:
  921                 return ("exception");
  922         default:
  923                 panic("%s: unknown intr_type %d", __func__, intr_type);
  924         }
  925 }
  926 #endif
  927 
  928 /*
  929  * Inject an event to vcpu as described in section 15.20, "Event injection".
  930  */
  931 static void
  932 svm_eventinject(struct svm_vcpu *vcpu, int intr_type, int vector,
  933     uint32_t error, bool ec_valid)
  934 {
  935         struct vmcb_ctrl *ctrl;
  936 
  937         ctrl = svm_get_vmcb_ctrl(vcpu);
  938 
  939         KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0,
  940             ("%s: event already pending %#lx", __func__, ctrl->eventinj));
  941 
  942         KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d",
  943             __func__, vector));
  944 
  945         switch (intr_type) {
  946         case VMCB_EVENTINJ_TYPE_INTR:
  947         case VMCB_EVENTINJ_TYPE_NMI:
  948         case VMCB_EVENTINJ_TYPE_INTn:
  949                 break;
  950         case VMCB_EVENTINJ_TYPE_EXCEPTION:
  951                 if (vector >= 0 && vector <= 31 && vector != 2)
  952                         break;
  953                 /* FALLTHROUGH */
  954         default:
  955                 panic("%s: invalid intr_type/vector: %d/%d", __func__,
  956                     intr_type, vector);
  957         }
  958         ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID;
  959         if (ec_valid) {
  960                 ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID;
  961                 ctrl->eventinj |= (uint64_t)error << 32;
  962                 SVM_CTR3(vcpu, "Injecting %s at vector %d errcode %#x",
  963                     intrtype_to_str(intr_type), vector, error);
  964         } else {
  965                 SVM_CTR2(vcpu, "Injecting %s at vector %d",
  966                     intrtype_to_str(intr_type), vector);
  967         }
  968 }
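
      /*
       * Per section 15.20 of the APM, the EVENTINJ value assembled above
       * carries the vector in bits 7:0, the event type in bits 10:8 and the
       * optional error code in bits 63:32, with VMCB_EVENTINJ_VALID marking
       * the injection as pending.  For example, injecting a #GP with a zero
       * error code amounts to
       *
       *      ctrl->eventinj = IDT_GP | (VMCB_EVENTINJ_TYPE_EXCEPTION << 8) |
       *          VMCB_EVENTINJ_EC_VALID | VMCB_EVENTINJ_VALID;
       */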
  969 
  970 static void
  971 svm_update_virqinfo(struct svm_vcpu *vcpu)
  972 {
  973         struct vlapic *vlapic;
  974         struct vmcb_ctrl *ctrl;
  975 
  976         vlapic = vm_lapic(vcpu->vcpu);
  977         ctrl = svm_get_vmcb_ctrl(vcpu);
  978 
  979         /* Update %cr8 in the emulated vlapic */
  980         vlapic_set_cr8(vlapic, ctrl->v_tpr);
  981 
  982         /* Virtual interrupt injection is not used. */
  983         KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid "
  984             "v_intr_vector %d", __func__, ctrl->v_intr_vector));
  985 }
  986 
  987 static void
  988 svm_save_intinfo(struct svm_softc *svm_sc, struct svm_vcpu *vcpu)
  989 {
  990         struct vmcb_ctrl *ctrl;
  991         uint64_t intinfo;
  992 
  993         ctrl = svm_get_vmcb_ctrl(vcpu);
  994         intinfo = ctrl->exitintinfo;    
  995         if (!VMCB_EXITINTINFO_VALID(intinfo))
  996                 return;
  997 
  998         /*
  999          * From APMv2, Section "Intercepts during IDT interrupt delivery"
 1000          *
 1001          * If a #VMEXIT happened during event delivery then record the event
 1002          * that was being delivered.
 1003          */
 1004         SVM_CTR2(vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", intinfo,
 1005             VMCB_EXITINTINFO_VECTOR(intinfo));
 1006         vmm_stat_incr(vcpu->vcpu, VCPU_EXITINTINFO, 1);
 1007         vm_exit_intinfo(vcpu->vcpu, intinfo);
 1008 }
 1009 
 1010 #ifdef INVARIANTS
 1011 static __inline int
 1012 vintr_intercept_enabled(struct svm_vcpu *vcpu)
 1013 {
 1014 
 1015         return (svm_get_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR));
 1016 }
 1017 #endif
 1018 
 1019 static __inline void
 1020 enable_intr_window_exiting(struct svm_vcpu *vcpu)
 1021 {
 1022         struct vmcb_ctrl *ctrl;
 1023 
 1024         ctrl = svm_get_vmcb_ctrl(vcpu);
 1025 
 1026         if (ctrl->v_irq && ctrl->v_intr_vector == 0) {
 1027                 KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
 1028                 KASSERT(vintr_intercept_enabled(vcpu),
 1029                     ("%s: vintr intercept should be enabled", __func__));
 1030                 return;
 1031         }
 1032 
 1033         SVM_CTR0(vcpu, "Enable intr window exiting");
 1034         ctrl->v_irq = 1;
 1035         ctrl->v_ign_tpr = 1;
 1036         ctrl->v_intr_vector = 0;
 1037         svm_set_dirty(vcpu, VMCB_CACHE_TPR);
 1038         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 1039 }
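
      /*
       * This "interrupt window" trick requests a dummy virtual interrupt
       * (V_IRQ with vector 0 and TPR ignored) while intercepting VINTR, so
       * the processor exits as soon as the guest can take an interrupt
       * again.  Those exits show up as VMEXIT_VINTR ("VM exits due to
       * interrupt window") and are counted in svm_vmexit().
       */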
 1040 
 1041 static __inline void
 1042 disable_intr_window_exiting(struct svm_vcpu *vcpu)
 1043 {
 1044         struct vmcb_ctrl *ctrl;
 1045 
 1046         ctrl = svm_get_vmcb_ctrl(vcpu);
 1047 
 1048         if (!ctrl->v_irq && ctrl->v_intr_vector == 0) {
 1049                 KASSERT(!vintr_intercept_enabled(vcpu),
 1050                     ("%s: vintr intercept should be disabled", __func__));
 1051                 return;
 1052         }
 1053 
 1054         SVM_CTR0(vcpu, "Disable intr window exiting");
 1055         ctrl->v_irq = 0;
 1056         ctrl->v_intr_vector = 0;
 1057         svm_set_dirty(vcpu, VMCB_CACHE_TPR);
 1058         svm_disable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
 1059 }
 1060 
 1061 static int
 1062 svm_modify_intr_shadow(struct svm_vcpu *vcpu, uint64_t val)
 1063 {
 1064         struct vmcb_ctrl *ctrl;
 1065         int oldval, newval;
 1066 
 1067         ctrl = svm_get_vmcb_ctrl(vcpu);
 1068         oldval = ctrl->intr_shadow;
 1069         newval = val ? 1 : 0;
 1070         if (newval != oldval) {
 1071                 ctrl->intr_shadow = newval;
 1072                 SVM_CTR1(vcpu, "Setting intr_shadow to %d", newval);
 1073         }
 1074         return (0);
 1075 }
 1076 
 1077 static int
 1078 svm_get_intr_shadow(struct svm_vcpu *vcpu, uint64_t *val)
 1079 {
 1080         struct vmcb_ctrl *ctrl;
 1081 
 1082         ctrl = svm_get_vmcb_ctrl(vcpu);
 1083         *val = ctrl->intr_shadow;
 1084         return (0);
 1085 }
 1086 
 1087 /*
 1088  * Once an NMI is injected it blocks delivery of further NMIs until the handler
 1089  * executes an IRET. The IRET intercept is enabled when an NMI is injected to
  1090  * track when the vcpu is done handling the NMI.
 1091  */
 1092 static int
 1093 nmi_blocked(struct svm_vcpu *vcpu)
 1094 {
 1095         int blocked;
 1096 
 1097         blocked = svm_get_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
 1098         return (blocked);
 1099 }
 1100 
 1101 static void
 1102 enable_nmi_blocking(struct svm_vcpu *vcpu)
 1103 {
 1104 
 1105         KASSERT(!nmi_blocked(vcpu), ("vNMI already blocked"));
 1106         SVM_CTR0(vcpu, "vNMI blocking enabled");
 1107         svm_enable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
 1108 }
 1109 
 1110 static void
 1111 clear_nmi_blocking(struct svm_vcpu *vcpu)
 1112 {
 1113         int error __diagused;
 1114 
 1115         KASSERT(nmi_blocked(vcpu), ("vNMI already unblocked"));
 1116         SVM_CTR0(vcpu, "vNMI blocking cleared");
 1117         /*
 1118          * When the IRET intercept is cleared the vcpu will attempt to execute
 1119          * the "iret" when it runs next. However, it is possible to inject
 1120          * another NMI into the vcpu before the "iret" has actually executed.
 1121          *
  1122          * For example, if the "iret" encounters a #NPF when accessing the stack
 1123          * it will trap back into the hypervisor. If an NMI is pending for
 1124          * the vcpu it will be injected into the guest.
 1125          *
 1126          * XXX this needs to be fixed
 1127          */
 1128         svm_disable_intercept(vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
 1129 
 1130         /*
 1131          * Set 'intr_shadow' to prevent an NMI from being injected on the
 1132          * immediate VMRUN.
 1133          */
 1134         error = svm_modify_intr_shadow(vcpu, 1);
 1135         KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error));
 1136 }
 1137 
 1138 #define EFER_MBZ_BITS   0xFFFFFFFFFFFF0200UL
 1139 
 1140 static int
 1141 svm_write_efer(struct svm_softc *sc, struct svm_vcpu *vcpu, uint64_t newval,
 1142     bool *retu)
 1143 {
 1144         struct vm_exit *vme;
 1145         struct vmcb_state *state;
 1146         uint64_t changed, lma, oldval;
 1147         int error __diagused;
 1148 
 1149         state = svm_get_vmcb_state(vcpu);
 1150 
 1151         oldval = state->efer;
 1152         SVM_CTR2(vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval);
 1153 
 1154         newval &= ~0xFE;                /* clear the Read-As-Zero (RAZ) bits */
 1155         changed = oldval ^ newval;
 1156 
 1157         if (newval & EFER_MBZ_BITS)
 1158                 goto gpf;
 1159 
 1160         /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */
 1161         if (changed & EFER_LME) {
 1162                 if (state->cr0 & CR0_PG)
 1163                         goto gpf;
 1164         }
 1165 
 1166         /* EFER.LMA = EFER.LME & CR0.PG */
 1167         if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0)
 1168                 lma = EFER_LMA;
 1169         else
 1170                 lma = 0;
 1171 
 1172         if ((newval & EFER_LMA) != lma)
 1173                 goto gpf;
 1174 
 1175         if (newval & EFER_NXE) {
 1176                 if (!vm_cpuid_capability(vcpu->vcpu, VCC_NO_EXECUTE))
 1177                         goto gpf;
 1178         }
 1179 
 1180         /*
 1181          * XXX bhyve does not enforce segment limits in 64-bit mode. Until
  1182          * this is fixed, flag a guest attempt to set EFER_LMSLE as an error.
 1183          */
 1184         if (newval & EFER_LMSLE) {
 1185                 vme = vm_exitinfo(vcpu->vcpu);
 1186                 vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0);
 1187                 *retu = true;
 1188                 return (0);
 1189         }
 1190 
 1191         if (newval & EFER_FFXSR) {
 1192                 if (!vm_cpuid_capability(vcpu->vcpu, VCC_FFXSR))
 1193                         goto gpf;
 1194         }
 1195 
 1196         if (newval & EFER_TCE) {
 1197                 if (!vm_cpuid_capability(vcpu->vcpu, VCC_TCE))
 1198                         goto gpf;
 1199         }
 1200 
 1201         error = svm_setreg(vcpu, VM_REG_GUEST_EFER, newval);
 1202         KASSERT(error == 0, ("%s: error %d updating efer", __func__, error));
 1203         return (0);
 1204 gpf:
 1205         vm_inject_gp(vcpu->vcpu);
 1206         return (0);
 1207 }
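
      /*
       * Concretely: a guest that is already paging (CR0.PG = 1) and tries to
       * toggle EFER.LME, or that writes an EFER image whose LMA bit does not
       * equal LME & CR0.PG, takes a #GP instead of having the write applied,
       * matching the long-mode consistency checks in APMv2 Table 14-5.
       */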
 1208 
 1209 static int
 1210 emulate_wrmsr(struct svm_softc *sc, struct svm_vcpu *vcpu, u_int num,
 1211     uint64_t val, bool *retu)
 1212 {
 1213         int error;
 1214 
 1215         if (lapic_msr(num))
 1216                 error = lapic_wrmsr(vcpu->vcpu, num, val, retu);
 1217         else if (num == MSR_EFER)
 1218                 error = svm_write_efer(sc, vcpu, val, retu);
 1219         else
 1220                 error = svm_wrmsr(vcpu, num, val, retu);
 1221 
 1222         return (error);
 1223 }
 1224 
 1225 static int
 1226 emulate_rdmsr(struct svm_vcpu *vcpu, u_int num, bool *retu)
 1227 {
 1228         struct vmcb_state *state;
 1229         struct svm_regctx *ctx;
 1230         uint64_t result;
 1231         int error;
 1232 
 1233         if (lapic_msr(num))
 1234                 error = lapic_rdmsr(vcpu->vcpu, num, &result, retu);
 1235         else
 1236                 error = svm_rdmsr(vcpu, num, &result, retu);
 1237 
 1238         if (error == 0) {
 1239                 state = svm_get_vmcb_state(vcpu);
 1240                 ctx = svm_get_guest_regctx(vcpu);
 1241                 state->rax = result & 0xffffffff;
 1242                 ctx->sctx_rdx = result >> 32;
 1243         }
 1244 
 1245         return (error);
 1246 }
 1247 
 1248 #ifdef KTR
 1249 static const char *
 1250 exit_reason_to_str(uint64_t reason)
 1251 {
 1252         int i;
 1253         static char reasonbuf[32];
 1254         static const struct {
 1255                 int reason;
 1256                 const char *str;
 1257         } reasons[] = {
 1258                 { .reason = VMCB_EXIT_INVALID,  .str = "invalvmcb" },
 1259                 { .reason = VMCB_EXIT_SHUTDOWN, .str = "shutdown" },
 1260                 { .reason = VMCB_EXIT_NPF,      .str = "nptfault" },
 1261                 { .reason = VMCB_EXIT_PAUSE,    .str = "pause" },
 1262                 { .reason = VMCB_EXIT_HLT,      .str = "hlt" },
 1263                 { .reason = VMCB_EXIT_CPUID,    .str = "cpuid" },
 1264                 { .reason = VMCB_EXIT_IO,       .str = "inout" },
 1265                 { .reason = VMCB_EXIT_MC,       .str = "mchk" },
 1266                 { .reason = VMCB_EXIT_INTR,     .str = "extintr" },
 1267                 { .reason = VMCB_EXIT_NMI,      .str = "nmi" },
 1268                 { .reason = VMCB_EXIT_VINTR,    .str = "vintr" },
 1269                 { .reason = VMCB_EXIT_MSR,      .str = "msr" },
 1270                 { .reason = VMCB_EXIT_IRET,     .str = "iret" },
 1271                 { .reason = VMCB_EXIT_MONITOR,  .str = "monitor" },
 1272                 { .reason = VMCB_EXIT_MWAIT,    .str = "mwait" },
 1273                 { .reason = VMCB_EXIT_VMRUN,    .str = "vmrun" },
 1274                 { .reason = VMCB_EXIT_VMMCALL,  .str = "vmmcall" },
 1275                 { .reason = VMCB_EXIT_VMLOAD,   .str = "vmload" },
 1276                 { .reason = VMCB_EXIT_VMSAVE,   .str = "vmsave" },
 1277                 { .reason = VMCB_EXIT_STGI,     .str = "stgi" },
 1278                 { .reason = VMCB_EXIT_CLGI,     .str = "clgi" },
 1279                 { .reason = VMCB_EXIT_SKINIT,   .str = "skinit" },
 1280                 { .reason = VMCB_EXIT_ICEBP,    .str = "icebp" },
 1281                 { .reason = VMCB_EXIT_INVD,     .str = "invd" },
 1282                 { .reason = VMCB_EXIT_INVLPGA,  .str = "invlpga" },
 1283         };
 1284 
 1285         for (i = 0; i < nitems(reasons); i++) {
 1286                 if (reasons[i].reason == reason)
 1287                         return (reasons[i].str);
 1288         }
 1289         snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason);
 1290         return (reasonbuf);
 1291 }
 1292 #endif  /* KTR */
 1293 
 1294 /*
 1295  * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs
 1296  * that are due to instruction intercepts as well as MSR and IOIO intercepts
 1297  * and exceptions caused by INT3, INTO and BOUND instructions.
 1298  *
 1299  * Return 1 if the nRIP is valid and 0 otherwise.
 1300  */
 1301 static int
 1302 nrip_valid(uint64_t exitcode)
 1303 {
 1304         switch (exitcode) {
 1305         case 0x00 ... 0x0F:     /* read of CR0 through CR15 */
 1306         case 0x10 ... 0x1F:     /* write of CR0 through CR15 */
 1307         case 0x20 ... 0x2F:     /* read of DR0 through DR15 */
 1308         case 0x30 ... 0x3F:     /* write of DR0 through DR15 */
 1309         case 0x43:              /* INT3 */
 1310         case 0x44:              /* INTO */
 1311         case 0x45:              /* BOUND */
 1312         case 0x65 ... 0x7C:     /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */
 1313         case 0x80 ... 0x8D:     /* VMEXIT_VMRUN ... VMEXIT_XSETBV */
 1314                 return (1);
 1315         default:
 1316                 return (0);
 1317         }
 1318 }
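
      /*
       * svm_vmexit() below uses this to compute the instruction length as
       * 'nrip - rip' when the exit code qualifies.  For example, a CPUID
       * intercept (exit code 0x72, inside the 0x65-0x7C range above) saves
       * an nRIP two bytes past %rip, because CPUID is a two-byte
       * instruction, so the guest resumes right after it.
       */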
 1319 
 1320 static int
 1321 svm_vmexit(struct svm_softc *svm_sc, struct svm_vcpu *vcpu,
 1322     struct vm_exit *vmexit)
 1323 {
 1324         struct vmcb *vmcb;
 1325         struct vmcb_state *state;
 1326         struct vmcb_ctrl *ctrl;
 1327         struct svm_regctx *ctx;
 1328         uint64_t code, info1, info2, val;
 1329         uint32_t eax, ecx, edx;
 1330         int error __diagused, errcode_valid, handled, idtvec, reflect;
 1331         bool retu;
 1332 
 1333         ctx = svm_get_guest_regctx(vcpu);
 1334         vmcb = svm_get_vmcb(vcpu);
 1335         state = &vmcb->state;
 1336         ctrl = &vmcb->ctrl;
 1337 
 1338         handled = 0;
 1339         code = ctrl->exitcode;
 1340         info1 = ctrl->exitinfo1;
 1341         info2 = ctrl->exitinfo2;
 1342 
 1343         vmexit->exitcode = VM_EXITCODE_BOGUS;
 1344         vmexit->rip = state->rip;
 1345         vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0;
 1346 
 1347         vmm_stat_incr(vcpu->vcpu, VMEXIT_COUNT, 1);
 1348 
 1349         /*
 1350          * #VMEXIT(INVALID) needs to be handled early because the VMCB is
 1351          * in an inconsistent state and can trigger assertions that would
 1352          * never happen otherwise.
 1353          */
 1354         if (code == VMCB_EXIT_INVALID) {
 1355                 vm_exit_svm(vmexit, code, info1, info2);
 1356                 return (0);
 1357         }
 1358 
 1359         KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event "
 1360             "injection valid bit is set %#lx", __func__, ctrl->eventinj));
 1361 
 1362         KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15,
 1363             ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)",
 1364             vmexit->inst_length, code, info1, info2));
 1365 
 1366         svm_update_virqinfo(vcpu);
 1367         svm_save_intinfo(svm_sc, vcpu);
 1368 
 1369         switch (code) {
 1370         case VMCB_EXIT_IRET:
 1371                 /*
 1372                  * Restart execution at "iret" but with the intercept cleared.
 1373                  */
 1374                 vmexit->inst_length = 0;
 1375                 clear_nmi_blocking(vcpu);
 1376                 handled = 1;
 1377                 break;
 1378         case VMCB_EXIT_VINTR:   /* interrupt window exiting */
 1379                 vmm_stat_incr(vcpu->vcpu, VMEXIT_VINTR, 1);
 1380                 handled = 1;
 1381                 break;
 1382         case VMCB_EXIT_INTR:    /* external interrupt */
 1383                 vmm_stat_incr(vcpu->vcpu, VMEXIT_EXTINT, 1);
 1384                 handled = 1;
 1385                 break;
 1386         case VMCB_EXIT_NMI:     /* external NMI */
 1387                 handled = 1;
 1388                 break;
 1389         case 0x40 ... 0x5F:
 1390                 vmm_stat_incr(vcpu->vcpu, VMEXIT_EXCEPTION, 1);
 1391                 reflect = 1;
 1392                 idtvec = code - 0x40;
 1393                 switch (idtvec) {
 1394                 case IDT_MC:
 1395                         /*
 1396                          * Call the machine check handler by hand. Also don't
 1397                          * reflect the machine check back into the guest.
 1398                          */
 1399                         reflect = 0;
 1400                         SVM_CTR0(vcpu, "Vectoring to MCE handler");
 1401                         __asm __volatile("int $18");
 1402                         break;
 1403                 case IDT_PF:
 1404                         error = svm_setreg(vcpu, VM_REG_GUEST_CR2, info2);
 1405                         KASSERT(error == 0, ("%s: error %d updating cr2",
 1406                             __func__, error));
 1407                         /* fallthru */
 1408                 case IDT_NP:
 1409                 case IDT_SS:
 1410                 case IDT_GP:
 1411                 case IDT_AC:
 1412                 case IDT_TS:
 1413                         errcode_valid = 1;
 1414                         break;
 1415 
 1416                 case IDT_DF:
 1417                         errcode_valid = 1;
 1418                         info1 = 0;
 1419                         break;
 1420 
 1421                 case IDT_BP:
 1422                 case IDT_OF:
 1423                 case IDT_BR:
 1424                         /*
 1425                          * The 'nrip' field is populated for INT3, INTO and
 1426                          * BOUND exceptions and this also implies that
 1427                          * 'inst_length' is non-zero.
 1428                          *
 1429                          * Reset 'inst_length' to zero so the guest %rip at
 1430                          * event injection is identical to what it was when
 1431                          * the exception originally happened.
 1432                          */
 1433                         SVM_CTR2(vcpu, "Reset inst_length from %d "
 1434                             "to zero before injecting exception %d",
 1435                             vmexit->inst_length, idtvec);
 1436                         vmexit->inst_length = 0;
 1437                         /* fallthru */
 1438                 default:
 1439                         errcode_valid = 0;
 1440                         info1 = 0;
 1441                         break;
 1442                 }
 1443                 KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) "
 1444                     "when reflecting exception %d into guest",
 1445                     vmexit->inst_length, idtvec));
 1446 
 1447                 if (reflect) {
 1448                         /* Reflect the exception back into the guest */
 1449                         SVM_CTR2(vcpu, "Reflecting exception "
 1450                             "%d/%#x into the guest", idtvec, (int)info1);
 1451                         error = vm_inject_exception(vcpu->vcpu, idtvec,
 1452                             errcode_valid, info1, 0);
 1453                         KASSERT(error == 0, ("%s: vm_inject_exception error %d",
 1454                             __func__, error));
 1455                 }
 1456                 handled = 1;
 1457                 break;
 1458         case VMCB_EXIT_MSR:     /* MSR access. */
 1459                 eax = state->rax;
 1460                 ecx = ctx->sctx_rcx;
 1461                 edx = ctx->sctx_rdx;
 1462                 retu = false;   
 1463 
 1464                 if (info1) {
 1465                         vmm_stat_incr(vcpu->vcpu, VMEXIT_WRMSR, 1);
 1466                         val = (uint64_t)edx << 32 | eax;
 1467                         SVM_CTR2(vcpu, "wrmsr %#x val %#lx", ecx, val);
 1468                         if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) {
 1469                                 vmexit->exitcode = VM_EXITCODE_WRMSR;
 1470                                 vmexit->u.msr.code = ecx;
 1471                                 vmexit->u.msr.wval = val;
 1472                         } else if (!retu) {
 1473                                 handled = 1;
 1474                         } else {
 1475                                 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 1476                                     ("emulate_wrmsr retu with bogus exitcode"));
 1477                         }
 1478                 } else {
 1479                         SVM_CTR1(vcpu, "rdmsr %#x", ecx);
 1480                         vmm_stat_incr(vcpu->vcpu, VMEXIT_RDMSR, 1);
 1481                         if (emulate_rdmsr(vcpu, ecx, &retu)) {
 1482                                 vmexit->exitcode = VM_EXITCODE_RDMSR;
 1483                                 vmexit->u.msr.code = ecx;
 1484                         } else if (!retu) {
 1485                                 handled = 1;
 1486                         } else {
 1487                                 KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
 1488                                     ("emulate_rdmsr retu with bogus exitcode"));
 1489                         }
 1490                 }
 1491                 break;
 1492         case VMCB_EXIT_IO:
 1493                 handled = svm_handle_io(vcpu, vmexit);
 1494                 vmm_stat_incr(vcpu->vcpu, VMEXIT_INOUT, 1);
 1495                 break;
 1496         case VMCB_EXIT_CPUID:
 1497                 vmm_stat_incr(vcpu->vcpu, VMEXIT_CPUID, 1);
 1498                 handled = x86_emulate_cpuid(vcpu->vcpu,
 1499                     &state->rax, &ctx->sctx_rbx, &ctx->sctx_rcx,
 1500                     &ctx->sctx_rdx);
 1501                 break;
 1502         case VMCB_EXIT_HLT:
 1503                 vmm_stat_incr(vcpu->vcpu, VMEXIT_HLT, 1);
 1504                 vmexit->exitcode = VM_EXITCODE_HLT;
 1505                 vmexit->u.hlt.rflags = state->rflags;
 1506                 break;
 1507         case VMCB_EXIT_PAUSE:
 1508                 vmexit->exitcode = VM_EXITCODE_PAUSE;
 1509                 vmm_stat_incr(vcpu->vcpu, VMEXIT_PAUSE, 1);
 1510                 break;
 1511         case VMCB_EXIT_NPF:
 1512                 /* EXITINFO2 contains the faulting guest physical address */
 1513                 if (info1 & VMCB_NPF_INFO1_RSV) {
 1514                         SVM_CTR2(vcpu, "nested page fault with "
 1515                             "reserved bits set: info1(%#lx) info2(%#lx)",
 1516                             info1, info2);
 1517                 } else if (vm_mem_allocated(vcpu->vcpu, info2)) {
 1518                         vmexit->exitcode = VM_EXITCODE_PAGING;
 1519                         vmexit->u.paging.gpa = info2;
 1520                         vmexit->u.paging.fault_type = npf_fault_type(info1);
 1521                         vmm_stat_incr(vcpu->vcpu, VMEXIT_NESTED_FAULT, 1);
 1522                         SVM_CTR3(vcpu, "nested page fault "
 1523                             "on gpa %#lx/%#lx at rip %#lx",
 1524                             info2, info1, state->rip);
 1525                 } else if (svm_npf_emul_fault(info1)) {
 1526                         svm_handle_inst_emul(vmcb, info2, vmexit);
 1527                         vmm_stat_incr(vcpu->vcpu, VMEXIT_INST_EMUL, 1);
 1528                         SVM_CTR3(vcpu, "inst_emul fault "
 1529                             "for gpa %#lx/%#lx at rip %#lx",
 1530                             info2, info1, state->rip);
 1531                 }
 1532                 break;
 1533         case VMCB_EXIT_MONITOR:
 1534                 vmexit->exitcode = VM_EXITCODE_MONITOR;
 1535                 break;
 1536         case VMCB_EXIT_MWAIT:
 1537                 vmexit->exitcode = VM_EXITCODE_MWAIT;
 1538                 break;
 1539         case VMCB_EXIT_SHUTDOWN:
 1540         case VMCB_EXIT_VMRUN:
 1541         case VMCB_EXIT_VMMCALL:
 1542         case VMCB_EXIT_VMLOAD:
 1543         case VMCB_EXIT_VMSAVE:
 1544         case VMCB_EXIT_STGI:
 1545         case VMCB_EXIT_CLGI:
 1546         case VMCB_EXIT_SKINIT:
 1547         case VMCB_EXIT_ICEBP:
 1548         case VMCB_EXIT_INVLPGA:
 1549                 vm_inject_ud(vcpu->vcpu);
 1550                 handled = 1;
 1551                 break;
 1552         case VMCB_EXIT_INVD:
 1553         case VMCB_EXIT_WBINVD:
 1554                 /* ignore exit */
 1555                 handled = 1;
 1556                 break;
 1557         default:
 1558                 vmm_stat_incr(vcpu->vcpu, VMEXIT_UNKNOWN, 1);
 1559                 break;
 1560         }       
 1561 
 1562         SVM_CTR4(vcpu, "%s %s vmexit at %#lx/%d",
 1563             handled ? "handled" : "unhandled", exit_reason_to_str(code),
 1564             vmexit->rip, vmexit->inst_length);
 1565 
 1566         if (handled) {
 1567                 vmexit->rip += vmexit->inst_length;
 1568                 vmexit->inst_length = 0;
 1569                 state->rip = vmexit->rip;
 1570         } else {
 1571                 if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
 1572                         /*
 1573                          * If this VM exit was not claimed by anybody then
 1574                          * treat it as a generic SVM exit.
 1575                          */
 1576                         vm_exit_svm(vmexit, code, info1, info2);
 1577                 } else {
 1578                         /*
 1579                          * The exitcode and collateral have been populated.
 1580                          * The VM exit will be processed further in userland.
 1581                          */
 1582                 }
 1583         }
 1584         return (handled);
 1585 }
 1586 
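      /*
       * Inject any event recorded in the vcpu's pending entry intinfo,
       * e.g. an event whose original delivery was cut short by a #VMEXIT.
       */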
 1587 static void
 1588 svm_inj_intinfo(struct svm_softc *svm_sc, struct svm_vcpu *vcpu)
 1589 {
 1590         uint64_t intinfo;
 1591 
 1592         if (!vm_entry_intinfo(vcpu->vcpu, &intinfo))
 1593                 return;
 1594 
 1595         KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not "
 1596             "valid: %#lx", __func__, intinfo));
 1597 
 1598         svm_eventinject(vcpu, VMCB_EXITINTINFO_TYPE(intinfo),
 1599                 VMCB_EXITINTINFO_VECTOR(intinfo),
 1600                 VMCB_EXITINTINFO_EC(intinfo),
 1601                 VMCB_EXITINTINFO_EC_VALID(intinfo));
 1602         vmm_stat_incr(vcpu->vcpu, VCPU_INTINFO_INJECTED, 1);
 1603         SVM_CTR1(vcpu, "Injected entry intinfo: %#lx", intinfo);
 1604 }
 1605 
 1606 /*
 1607  * Inject event to virtual cpu.
 1608  */
 1609 static void
 1610 svm_inj_interrupts(struct svm_softc *sc, struct svm_vcpu *vcpu,
 1611     struct vlapic *vlapic)
 1612 {
 1613         struct vmcb_ctrl *ctrl;
 1614         struct vmcb_state *state;
 1615         uint8_t v_tpr;
 1616         int vector, need_intr_window;
 1617         int extint_pending;
 1618 
 1619         state = svm_get_vmcb_state(vcpu);
 1620         ctrl  = svm_get_vmcb_ctrl(vcpu);
 1621 
 1622         need_intr_window = 0;
 1623 
 1624         if (vcpu->nextrip != state->rip) {
 1625                 ctrl->intr_shadow = 0;
 1626                 SVM_CTR2(vcpu, "Guest interrupt blocking "
 1627                     "cleared due to rip change: %#lx/%#lx",
 1628                     vcpu->nextrip, state->rip);
 1629         }
 1630 
 1631         /*
 1632          * Inject pending events or exceptions for this vcpu.
 1633          *
 1634          * An event might be pending because the previous #VMEXIT happened
 1635          * during event delivery (i.e. ctrl->exitintinfo).
 1636          *
 1637          * An event might also be pending because an exception was injected
 1638          * by the hypervisor (e.g. #PF during instruction emulation).
 1639          */
 1640         svm_inj_intinfo(sc, vcpu);
 1641 
 1642         /* NMI event has priority over interrupts. */
 1643         if (vm_nmi_pending(vcpu->vcpu)) {
 1644                 if (nmi_blocked(vcpu)) {
 1645                         /*
 1646                          * Can't inject another NMI if the guest has not
 1647                          * yet executed an "iret" after the last NMI.
 1648                          */
 1649                         SVM_CTR0(vcpu, "Cannot inject NMI due "
 1650                             "to NMI-blocking");
 1651                 } else if (ctrl->intr_shadow) {
 1652                         /*
 1653                          * Can't inject an NMI if the vcpu is in an intr_shadow.
 1654                          */
 1655                         SVM_CTR0(vcpu, "Cannot inject NMI due to "
 1656                             "interrupt shadow");
 1657                         need_intr_window = 1;
 1658                         goto done;
 1659                 } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
 1660                         /*
 1661                          * If there is already an exception/interrupt pending
 1662                          * then defer the NMI until after that.
 1663                          */
 1664                         SVM_CTR1(vcpu, "Cannot inject NMI due to "
 1665                             "eventinj %#lx", ctrl->eventinj);
 1666 
 1667                         /*
 1668                          * Use self-IPI to trigger a VM-exit as soon as
 1669                          * possible after the event injection is completed.
 1670                          *
 1671                          * This works only if the external interrupt exiting
 1672                          * is at a lower priority than the event injection.
 1673                          *
 1674                          * Although not explicitly specified in APMv2 the
 1675                          * relative priorities were verified empirically.
 1676                          */
 1677                         ipi_cpu(curcpu, IPI_AST);       /* XXX vmm_ipinum? */
 1678                 } else {
 1679                         vm_nmi_clear(vcpu->vcpu);
 1680 
 1681                         /* Inject NMI, vector number is not used */
 1682                         svm_eventinject(vcpu, VMCB_EVENTINJ_TYPE_NMI,
 1683                             IDT_NMI, 0, false);
 1684 
 1685                         /* virtual NMI blocking is now in effect */
 1686                         enable_nmi_blocking(vcpu);
 1687 
 1688                         SVM_CTR0(vcpu, "Injecting vNMI");
 1689                 }
 1690         }
 1691 
 1692         extint_pending = vm_extint_pending(vcpu->vcpu);
 1693         if (!extint_pending) {
 1694                 if (!vlapic_pending_intr(vlapic, &vector))
 1695                         goto done;
 1696                 KASSERT(vector >= 16 && vector <= 255,
 1697                     ("invalid vector %d from local APIC", vector));
 1698         } else {
 1699                 /* Ask the legacy pic for a vector to inject */
 1700                 vatpic_pending_intr(sc->vm, &vector);
 1701                 KASSERT(vector >= 0 && vector <= 255,
 1702                     ("invalid vector %d from INTR", vector));
 1703         }
 1704 
 1705         /*
 1706          * If the guest has disabled interrupts or is in an interrupt shadow
 1707          * then we cannot inject the pending interrupt.
 1708          */
 1709         if ((state->rflags & PSL_I) == 0) {
 1710                 SVM_CTR2(vcpu, "Cannot inject vector %d due to "
 1711                     "rflags %#lx", vector, state->rflags);
 1712                 need_intr_window = 1;
 1713                 goto done;
 1714         }
 1715 
 1716         if (ctrl->intr_shadow) {
 1717                 SVM_CTR1(vcpu, "Cannot inject vector %d due to "
 1718                     "interrupt shadow", vector);
 1719                 need_intr_window = 1;
 1720                 goto done;
 1721         }
 1722 
 1723         if (ctrl->eventinj & VMCB_EVENTINJ_VALID) {
 1724                 SVM_CTR2(vcpu, "Cannot inject vector %d due to "
 1725                     "eventinj %#lx", vector, ctrl->eventinj);
 1726                 need_intr_window = 1;
 1727                 goto done;
 1728         }
 1729 
 1730         svm_eventinject(vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false);
 1731 
 1732         if (!extint_pending) {
 1733                 vlapic_intr_accepted(vlapic, vector);
 1734         } else {
 1735                 vm_extint_clear(vcpu->vcpu);
 1736                 vatpic_intr_accepted(sc->vm, vector);
 1737         }
 1738 
 1739         /*
 1740          * Force a VM-exit as soon as the vcpu is ready to accept another
 1741          * interrupt. This is done because the PIC might have another vector
 1742          * that it wants to inject. Also, if the APIC has a pending interrupt
 1743          * that was preempted by the ExtInt then it allows us to inject the
 1744          * APIC vector as soon as possible.
 1745          */
 1746         need_intr_window = 1;
 1747 done:
 1748         /*
 1749          * The guest can modify the TPR by writing to %CR8. In guest mode
 1750          * the processor reflects this write to V_TPR without hypervisor
 1751          * intervention.
 1752          *
 1753          * The guest can also modify the TPR by writing to it via the memory
 1754          * mapped APIC page. In this case, the write will be emulated by the
 1755          * hypervisor. For this reason V_TPR must be updated before every
 1756          * VMRUN.
 1757          */
 1758         v_tpr = vlapic_get_cr8(vlapic);
 1759         KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr));
 1760         if (ctrl->v_tpr != v_tpr) {
 1761                 SVM_CTR2(vcpu, "VMCB V_TPR changed from %#x to %#x",
 1762                     ctrl->v_tpr, v_tpr);
 1763                 ctrl->v_tpr = v_tpr;
 1764                 svm_set_dirty(vcpu, VMCB_CACHE_TPR);
 1765         }
 1766 
 1767         if (need_intr_window) {
 1768                 /*
 1769                  * We use V_IRQ in conjunction with the VINTR intercept to
 1770                  * trap into the hypervisor as soon as a virtual interrupt
 1771                  * can be delivered.
 1772                  *
 1773                  * Since injected events are not subject to intercept checks
 1774                  * we need to ensure that the V_IRQ is not actually going to
 1775                  * be delivered on VM entry. The KASSERT below enforces this.
 1776                  */
 1777                 KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 ||
 1778                     (state->rflags & PSL_I) == 0 || ctrl->intr_shadow,
 1779                     ("Bogus intr_window_exiting: eventinj (%#lx), "
 1780                     "intr_shadow (%u), rflags (%#lx)",
 1781                     ctrl->eventinj, ctrl->intr_shadow, state->rflags));
 1782                 enable_intr_window_exiting(vcpu);
 1783         } else {
 1784                 disable_intr_window_exiting(vcpu);
 1785         }
 1786 }
 1787 
 1788 static __inline void
 1789 restore_host_tss(void)
 1790 {
 1791         struct system_segment_descriptor *tss_sd;
 1792 
 1793         /*
 1794          * The TSS descriptor was in use prior to launching the guest so it
 1795          * has been marked busy.
 1796          *
 1797          * 'ltr' requires the descriptor to be marked available so change the
 1798          * type to "64-bit available TSS".
 1799          */
 1800         tss_sd = PCPU_GET(tss);
 1801         tss_sd->sd_type = SDT_SYSTSS;
 1802         ltr(GSEL(GPROC0_SEL, SEL_KPL));
 1803 }
 1804 
 1805 static void
 1806 svm_pmap_activate(struct svm_vcpu *vcpu, pmap_t pmap)
 1807 {
 1808         struct vmcb_ctrl *ctrl;
 1809         long eptgen;
 1810         int cpu;
 1811         bool alloc_asid;
 1812 
 1813         cpu = curcpu;
 1814         CPU_SET_ATOMIC(cpu, &pmap->pm_active);
 1815         smr_enter(pmap->pm_eptsmr);
 1816 
 1817         ctrl = svm_get_vmcb_ctrl(vcpu);
 1818 
 1819         /*
 1820          * The TLB entries associated with the vcpu's ASID are not valid
 1821          * if either of the following conditions is true:
 1822          *
 1823          * 1. The vcpu's ASID generation is different than the host cpu's
 1824          *    ASID generation. This happens when the vcpu migrates to a new
 1825          *    host cpu. It can also happen when the number of vcpus executing
 1826          *    on a host cpu is greater than the number of ASIDs available.
 1827          *
 1828          * 2. The pmap generation number is different than the value cached in
 1829          *    the vcpu ('vcpu->eptgen'). This happens when the host invalidates
 1830          *    pages belonging to the guest.
 1831          *
 1832          *      asidgen         eptgen        Action
 1833          *      mismatch        mismatch
 1834          *         0               0            (a)
 1835          *         0               1            (b1) or (b2)
 1836          *         1               0            (c)
 1837          *         1               1            (d)
 1838          *
 1839          * (a) There is no mismatch in eptgen or ASID generation and therefore
 1840          *     no further action is needed.
 1841          *
 1842          * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is
 1843          *      retained and the TLB entries associated with this ASID
 1844          *      are flushed by VMRUN.
 1845          *
 1846          * (b2) If the cpu does not support FlushByAsid then a new ASID is
 1847          *      allocated.
 1848          *
 1849          * (c) A new ASID is allocated.
 1850          *
 1851          * (d) A new ASID is allocated.
 1852          */
 1853 
 1854         alloc_asid = false;
 1855         eptgen = atomic_load_long(&pmap->pm_eptgen);
 1856         ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING;
 1857 
 1858         if (vcpu->asid.gen != asid[cpu].gen) {
 1859                 alloc_asid = true;      /* (c) and (d) */
 1860         } else if (vcpu->eptgen != eptgen) {
 1861                 if (flush_by_asid())
 1862                         ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;  /* (b1) */
 1863                 else
 1864                         alloc_asid = true;                      /* (b2) */
 1865         } else {
 1866                 /*
 1867                  * This is the common case (a).
 1868                  */
 1869                 KASSERT(!alloc_asid, ("ASID allocation not necessary"));
 1870                 KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING,
 1871                     ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl));
 1872         }
 1873 
 1874         if (alloc_asid) {
 1875                 if (++asid[cpu].num >= nasid) {
 1876                         asid[cpu].num = 1;
 1877                         if (++asid[cpu].gen == 0)
 1878                                 asid[cpu].gen = 1;
 1879                         /*
 1880                          * If this cpu does not support "flush-by-asid"
 1881                          * then flush the entire TLB on a generation
 1882                          * bump. Subsequent ASID allocation in this
 1883                          * generation can be done without a TLB flush.
 1884                          */
 1885                         if (!flush_by_asid())
 1886                                 ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL;
 1887                 }
 1888                 vcpu->asid.gen = asid[cpu].gen;
 1889                 vcpu->asid.num = asid[cpu].num;
 1890 
 1891                 ctrl->asid = vcpu->asid.num;
 1892                 svm_set_dirty(vcpu, VMCB_CACHE_ASID);
 1893                 /*
 1894                  * If this cpu supports "flush-by-asid" then the TLB
 1895                  * was not flushed after the generation bump. The TLB
 1896                  * is flushed selectively after every new ASID allocation.
 1897                  */
 1898                 if (flush_by_asid())
 1899                         ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST;
 1900         }
 1901         vcpu->eptgen = eptgen;
 1902 
 1903         KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero"));
 1904         KASSERT(ctrl->asid == vcpu->asid.num,
 1905             ("ASID mismatch: %u/%u", ctrl->asid, vcpu->asid.num));
 1906 }
 1907 
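      /*
       * Undo svm_pmap_activate(): leave the nested page table SMR read
       * section and clear this CPU from the pmap's active set.
       */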
 1908 static void
 1909 svm_pmap_deactivate(pmap_t pmap)
 1910 {
 1911         smr_exit(pmap->pm_eptsmr);
 1912         CPU_CLR_ATOMIC(curcpu, &pmap->pm_active);
 1913 }
 1914 
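      /*
       * CLGI and STGI clear and set the global interrupt flag (GIF).
       * While GIF is clear, physical interrupts and NMIs are held
       * pending on the host.
       */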
 1915 static __inline void
 1916 disable_gintr(void)
 1917 {
 1918 
 1919         __asm __volatile("clgi");
 1920 }
 1921 
 1922 static __inline void
 1923 enable_gintr(void)
 1924 {
 1925 
 1926         __asm __volatile("stgi");
 1927 }
 1928 
 1929 static __inline void
 1930 svm_dr_enter_guest(struct svm_regctx *gctx)
 1931 {
 1932 
 1933         /* Save host control debug registers. */
 1934         gctx->host_dr7 = rdr7();
 1935         gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
 1936 
 1937         /*
 1938          * Disable debugging in DR7 and DEBUGCTL to avoid triggering
 1939          * exceptions in the host based on the guest DRx values.  The
 1940          * guest DR6, DR7, and DEBUGCTL are saved/restored in the
 1941          * VMCB.
 1942          */
 1943         load_dr7(0);
 1944         wrmsr(MSR_DEBUGCTLMSR, 0);
 1945 
 1946         /* Save host debug registers. */
 1947         gctx->host_dr0 = rdr0();
 1948         gctx->host_dr1 = rdr1();
 1949         gctx->host_dr2 = rdr2();
 1950         gctx->host_dr3 = rdr3();
 1951         gctx->host_dr6 = rdr6();
 1952 
 1953         /* Restore guest debug registers. */
 1954         load_dr0(gctx->sctx_dr0);
 1955         load_dr1(gctx->sctx_dr1);
 1956         load_dr2(gctx->sctx_dr2);
 1957         load_dr3(gctx->sctx_dr3);
 1958 }
 1959 
 1960 static __inline void
 1961 svm_dr_leave_guest(struct svm_regctx *gctx)
 1962 {
 1963 
 1964         /* Save guest debug registers. */
 1965         gctx->sctx_dr0 = rdr0();
 1966         gctx->sctx_dr1 = rdr1();
 1967         gctx->sctx_dr2 = rdr2();
 1968         gctx->sctx_dr3 = rdr3();
 1969 
 1970         /*
 1971          * Restore host debug registers.  Restore DR7 and DEBUGCTL
 1972          * last.
 1973          */
 1974         load_dr0(gctx->host_dr0);
 1975         load_dr1(gctx->host_dr1);
 1976         load_dr2(gctx->host_dr2);
 1977         load_dr3(gctx->host_dr3);
 1978         load_dr6(gctx->host_dr6);
 1979         wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
 1980         load_dr7(gctx->host_dr7);
 1981 }
 1982 
 1983 /*
 1984  * Start vcpu with specified RIP.
 1985  */
 1986 static int
 1987 svm_run(void *vcpui, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo)
 1988 {
 1989         struct svm_regctx *gctx;
 1990         struct svm_softc *svm_sc;
 1991         struct svm_vcpu *vcpu;
 1992         struct vmcb_state *state;
 1993         struct vmcb_ctrl *ctrl;
 1994         struct vm_exit *vmexit;
 1995         struct vlapic *vlapic;
 1996         uint64_t vmcb_pa;
 1997         int handled;
 1998         uint16_t ldt_sel;
 1999 
 2000         vcpu = vcpui;
 2001         svm_sc = vcpu->sc;
 2002         state = svm_get_vmcb_state(vcpu);
 2003         ctrl = svm_get_vmcb_ctrl(vcpu);
 2004         vmexit = vm_exitinfo(vcpu->vcpu);
 2005         vlapic = vm_lapic(vcpu->vcpu);
 2006 
 2007         gctx = svm_get_guest_regctx(vcpu);
 2008         vmcb_pa = vcpu->vmcb_pa;
 2009 
 2010         if (vcpu->lastcpu != curcpu) {
 2011                 /*
 2012                  * Force new ASID allocation by invalidating the generation.
 2013                  */
 2014                 vcpu->asid.gen = 0;
 2015 
 2016                 /*
 2017                  * Invalidate the VMCB state cache by marking all fields dirty.
 2018                  */
 2019                 svm_set_dirty(vcpu, 0xffffffff);
 2020 
 2021                 /*
 2022                  * XXX
 2023                  * Setting 'vcpu->lastcpu' here is a bit premature because
 2024                  * we may return from this function without actually executing
 2025                  * the VMRUN instruction. This could happen if a rendezvous
 2026                  * or an AST is pending on the first time through the loop.
 2027                  *
 2028                  * This works for now but any new side-effects of vcpu
 2029                  * migration should take this case into account.
 2030                  */
 2031                 vcpu->lastcpu = curcpu;
 2032                 vmm_stat_incr(vcpu->vcpu, VCPU_MIGRATIONS, 1);
 2033         }
 2034 
 2035         svm_msr_guest_enter(vcpu);
 2036 
 2037         /* Update Guest RIP */
 2038         state->rip = rip;
 2039 
 2040         do {
 2041                 /*
 2042                  * Disable global interrupts to guarantee atomicity during
 2043                  * loading of guest state. This includes not only the state
 2044                  * loaded by the "vmrun" instruction but also software state
 2045                  * maintained by the hypervisor: suspended and rendezvous
 2046                  * state, NPT generation number, vlapic interrupts etc.
 2047                  */
 2048                 disable_gintr();
 2049 
 2050                 if (vcpu_suspended(evinfo)) {
 2051                         enable_gintr();
 2052                         vm_exit_suspended(vcpu->vcpu, state->rip);
 2053                         break;
 2054                 }
 2055 
 2056                 if (vcpu_rendezvous_pending(evinfo)) {
 2057                         enable_gintr();
 2058                         vm_exit_rendezvous(vcpu->vcpu, state->rip);
 2059                         break;
 2060                 }
 2061 
 2062                 if (vcpu_reqidle(evinfo)) {
 2063                         enable_gintr();
 2064                         vm_exit_reqidle(vcpu->vcpu, state->rip);
 2065                         break;
 2066                 }
 2067 
 2068                 /* The scheduler has asked us to yield the CPU. */
 2069                 if (vcpu_should_yield(vcpu->vcpu)) {
 2070                         enable_gintr();
 2071                         vm_exit_astpending(vcpu->vcpu, state->rip);
 2072                         break;
 2073                 }
 2074 
 2075                 if (vcpu_debugged(vcpu->vcpu)) {
 2076                         enable_gintr();
 2077                         vm_exit_debug(vcpu->vcpu, state->rip);
 2078                         break;
 2079                 }
 2080 
 2081                 /*
 2082                  * #VMEXIT resumes the host with the guest LDTR, so
 2083                  * save the current LDT selector so it can be restored
 2084                  * after an exit.  The userspace hypervisor probably
 2085                  * doesn't use an LDT, but save and restore it to be
 2086                  * safe.
 2087                  */
 2088                 ldt_sel = sldt();
 2089 
 2090                 svm_inj_interrupts(svm_sc, vcpu, vlapic);
 2091 
 2092                 /*
 2093                  * Check the pmap generation and the ASID generation to
 2094                  * ensure that the vcpu does not use stale TLB mappings.
 2095                  */
 2096                 svm_pmap_activate(vcpu, pmap);
 2097 
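                      /*
                       * Advertise which cached VMCB fields the processor may
                       * reuse: the supported clean bits minus anything dirtied
                       * since the last VMRUN.
                       */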
 2098                 ctrl->vmcb_clean = vmcb_clean & ~vcpu->dirty;
 2099                 vcpu->dirty = 0;
 2100                 SVM_CTR1(vcpu, "vmcb clean %#x", ctrl->vmcb_clean);
 2101 
 2102                 /* Launch Virtual Machine. */
 2103                 SVM_CTR1(vcpu, "Resume execution at %#lx", state->rip);
 2104                 svm_dr_enter_guest(gctx);
 2105                 svm_launch(vmcb_pa, gctx, get_pcpu());
 2106                 svm_dr_leave_guest(gctx);
 2107 
 2108                 svm_pmap_deactivate(pmap);
 2109 
 2110                 /*
 2111                  * The host GDTR and IDTR are saved by VMRUN and restored
 2112                  * automatically on #VMEXIT. However, the host TSS needs
 2113                  * to be restored explicitly.
 2114                  */
 2115                 restore_host_tss();
 2116 
 2117                 /* Restore host LDTR. */
 2118                 lldt(ldt_sel);
 2119 
 2120                 /* #VMEXIT disables interrupts so re-enable them here. */ 
 2121                 enable_gintr();
 2122 
 2123                 /* Update 'nextrip' */
 2124                 vcpu->nextrip = state->rip;
 2125 
 2126                 /* Handle #VMEXIT and if required return to user space. */
 2127                 handled = svm_vmexit(svm_sc, vcpu, vmexit);
 2128         } while (handled);
 2129 
 2130         svm_msr_guest_exit(vcpu);
 2131 
 2132         return (0);
 2133 }
 2134 
 2135 static void
 2136 svm_vcpu_cleanup(void *vcpui)
 2137 {
 2138         struct svm_vcpu *vcpu = vcpui;
 2139 
 2140         free(vcpu->vmcb, M_SVM);
 2141         free(vcpu, M_SVM);
 2142 }
 2143 
 2144 static void
 2145 svm_cleanup(void *vmi)
 2146 {
 2147         struct svm_softc *sc = vmi;
 2148 
 2149         contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM);
 2150         contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM);
 2151         free(sc, M_SVM);
 2152 }
 2153 
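      /*
       * Map a VM_REG_GUEST_* identifier to its slot in the software-saved
       * register context. Registers that live in the VMCB itself (e.g.
       * %rax, %rsp, %rip) are not handled here and yield NULL.
       */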
 2154 static register_t *
 2155 swctx_regptr(struct svm_regctx *regctx, int reg)
 2156 {
 2157 
 2158         switch (reg) {
 2159         case VM_REG_GUEST_RBX:
 2160                 return (&regctx->sctx_rbx);
 2161         case VM_REG_GUEST_RCX:
 2162                 return (&regctx->sctx_rcx);
 2163         case VM_REG_GUEST_RDX:
 2164                 return (&regctx->sctx_rdx);
 2165         case VM_REG_GUEST_RDI:
 2166                 return (&regctx->sctx_rdi);
 2167         case VM_REG_GUEST_RSI:
 2168                 return (&regctx->sctx_rsi);
 2169         case VM_REG_GUEST_RBP:
 2170                 return (&regctx->sctx_rbp);
 2171         case VM_REG_GUEST_R8:
 2172                 return (&regctx->sctx_r8);
 2173         case VM_REG_GUEST_R9:
 2174                 return (&regctx->sctx_r9);
 2175         case VM_REG_GUEST_R10:
 2176                 return (&regctx->sctx_r10);
 2177         case VM_REG_GUEST_R11:
 2178                 return (&regctx->sctx_r11);
 2179         case VM_REG_GUEST_R12:
 2180                 return (&regctx->sctx_r12);
 2181         case VM_REG_GUEST_R13:
 2182                 return (&regctx->sctx_r13);
 2183         case VM_REG_GUEST_R14:
 2184                 return (&regctx->sctx_r14);
 2185         case VM_REG_GUEST_R15:
 2186                 return (&regctx->sctx_r15);
 2187         case VM_REG_GUEST_DR0:
 2188                 return (&regctx->sctx_dr0);
 2189         case VM_REG_GUEST_DR1:
 2190                 return (&regctx->sctx_dr1);
 2191         case VM_REG_GUEST_DR2:
 2192                 return (&regctx->sctx_dr2);
 2193         case VM_REG_GUEST_DR3:
 2194                 return (&regctx->sctx_dr3);
 2195         default:
 2196                 return (NULL);
 2197         }
 2198 }
 2199 
 2200 static int
 2201 svm_getreg(void *vcpui, int ident, uint64_t *val)
 2202 {
 2203         struct svm_vcpu *vcpu;
 2204         register_t *reg;
 2205 
 2206         vcpu = vcpui;
 2207 
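              /*
               * The interrupt shadow is synthesized from VMCB state; other
               * registers are looked up in the VMCB first and then in the
               * software-saved context.
               */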
 2208         if (ident == VM_REG_GUEST_INTR_SHADOW) {
 2209                 return (svm_get_intr_shadow(vcpu, val));
 2210         }
 2211 
 2212         if (vmcb_read(vcpu, ident, val) == 0) {
 2213                 return (0);
 2214         }
 2215 
 2216         reg = swctx_regptr(svm_get_guest_regctx(vcpu), ident);
 2217 
 2218         if (reg != NULL) {
 2219                 *val = *reg;
 2220                 return (0);
 2221         }
 2222 
 2223         SVM_CTR1(vcpu, "svm_getreg: unknown register %#x", ident);
 2224         return (EINVAL);
 2225 }
 2226 
 2227 static int
 2228 svm_setreg(void *vcpui, int ident, uint64_t val)
 2229 {
 2230         struct svm_vcpu *vcpu;
 2231         register_t *reg;
 2232 
 2233         vcpu = vcpui;
 2234 
 2235         if (ident == VM_REG_GUEST_INTR_SHADOW) {
 2236                 return (svm_modify_intr_shadow(vcpu, val));
 2237         }
 2238 
 2239         /* Do not permit user write access to VMCB fields by offset. */
 2240         if (!VMCB_ACCESS_OK(ident)) {
 2241                 if (vmcb_write(vcpu, ident, val) == 0) {
 2242                         return (0);
 2243                 }
 2244         }
 2245 
 2246         reg = swctx_regptr(svm_get_guest_regctx(vcpu), ident);
 2247 
 2248         if (reg != NULL) {
 2249                 *reg = val;
 2250                 return (0);
 2251         }
 2252 
 2253         if (ident == VM_REG_GUEST_ENTRY_INST_LENGTH) {
 2254                 /* Ignore. */
 2255                 return (0);
 2256         }
 2257 
 2258         /*
 2259          * XXX deal with CR3 and invalidate TLB entries tagged with the
 2260          * vcpu's ASID. This needs to be treated differently depending on
 2261          * whether 'running' is true/false.
 2262          */
 2263 
 2264         SVM_CTR1(vcpu, "svm_setreg: unknown register %#x", ident);
 2265         return (EINVAL);
 2266 }
 2267 
 2268 static int
 2269 svm_getdesc(void *vcpui, int reg, struct seg_desc *desc)
 2270 {
 2271         return (vmcb_getdesc(vcpui, reg, desc));
 2272 }
 2273 
 2274 static int
 2275 svm_setdesc(void *vcpui, int reg, struct seg_desc *desc)
 2276 {
 2277         return (vmcb_setdesc(vcpui, reg, desc));
 2278 }
 2279 
 2280 #ifdef BHYVE_SNAPSHOT
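      /*
       * Save or restore a single guest register through the snapshot
       * metadata stream, depending on the requested operation.
       */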
 2281 static int
 2282 svm_snapshot_reg(void *vcpui, int ident, struct vm_snapshot_meta *meta)
 2283 {
 2284         int ret;
 2285         uint64_t val;
 2286 
 2287         if (meta->op == VM_SNAPSHOT_SAVE) {
 2288                 ret = svm_getreg(vcpui, ident, &val);
 2289                 if (ret != 0)
 2290                         goto done;
 2291 
 2292                 SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
 2293         } else if (meta->op == VM_SNAPSHOT_RESTORE) {
 2294                 SNAPSHOT_VAR_OR_LEAVE(val, meta, ret, done);
 2295 
 2296                 ret = svm_setreg(vcpui, ident, val);
 2297                 if (ret != 0)
 2298                         goto done;
 2299         } else {
 2300                 ret = EINVAL;
 2301                 goto done;
 2302         }
 2303 
 2304 done:
 2305         return (ret);
 2306 }
 2307 #endif
 2308 
 2309 static int
 2310 svm_setcap(void *vcpui, int type, int val)
 2311 {
 2312         struct svm_vcpu *vcpu;
 2313         struct vlapic *vlapic;
 2314         int error;
 2315 
 2316         vcpu = vcpui;
 2317         error = 0;
 2318 
 2319         switch (type) {
 2320         case VM_CAP_HALT_EXIT:
 2321                 svm_set_intercept(vcpu, VMCB_CTRL1_INTCPT,
 2322                     VMCB_INTCPT_HLT, val);
 2323                 break;
 2324         case VM_CAP_PAUSE_EXIT:
 2325                 svm_set_intercept(vcpu, VMCB_CTRL1_INTCPT,
 2326                     VMCB_INTCPT_PAUSE, val);
 2327                 break;
 2328         case VM_CAP_UNRESTRICTED_GUEST:
 2329                 /* Unrestricted guest execution cannot be disabled in SVM */
 2330                 if (val == 0)
 2331                         error = EINVAL;
 2332                 break;
 2333         case VM_CAP_IPI_EXIT:
 2334                 vlapic = vm_lapic(vcpu->vcpu);
 2335                 vlapic->ipi_exit = val;
 2336                 break;
 2337         default:
 2338                 error = ENOENT;
 2339                 break;
 2340         }
 2341         return (error);
 2342 }
 2343 
 2344 static int
 2345 svm_getcap(void *vcpui, int type, int *retval)
 2346 {
 2347         struct svm_vcpu *vcpu;
 2348         struct vlapic *vlapic;
 2349         int error;
 2350 
 2351         vcpu = vcpui;
 2352         error = 0;
 2353 
 2354         switch (type) {
 2355         case VM_CAP_HALT_EXIT:
 2356                 *retval = svm_get_intercept(vcpu, VMCB_CTRL1_INTCPT,
 2357                     VMCB_INTCPT_HLT);
 2358                 break;
 2359         case VM_CAP_PAUSE_EXIT:
 2360                 *retval = svm_get_intercept(vcpu, VMCB_CTRL1_INTCPT,
 2361                     VMCB_INTCPT_PAUSE);
 2362                 break;
 2363         case VM_CAP_UNRESTRICTED_GUEST:
 2364                 *retval = 1;    /* unrestricted guest is always enabled */
 2365                 break;
 2366         case VM_CAP_IPI_EXIT:
 2367                 vlapic = vm_lapic(vcpu->vcpu);
 2368                 *retval = vlapic->ipi_exit;
 2369                 break;
 2370         default:
 2371                 error = ENOENT;
 2372                 break;
 2373         }
 2374         return (error);
 2375 }
 2376 
 2377 static struct vmspace *
 2378 svm_vmspace_alloc(vm_offset_t min, vm_offset_t max)
 2379 {
 2380         return (svm_npt_alloc(min, max));
 2381 }
 2382 
 2383 static void
 2384 svm_vmspace_free(struct vmspace *vmspace)
 2385 {
 2386         svm_npt_free(vmspace);
 2387 }
 2388 
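      /*
       * Allocate and initialize the virtual local APIC for a vcpu,
       * including a page-aligned, zeroed backing page for the APIC
       * register state.
       */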
 2389 static struct vlapic *
 2390 svm_vlapic_init(void *vcpui)
 2391 {
 2392         struct svm_vcpu *vcpu;
 2393         struct vlapic *vlapic;
 2394 
 2395         vcpu = vcpui;
 2396         vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO);
 2397         vlapic->vm = vcpu->sc->vm;
 2398         vlapic->vcpu = vcpu->vcpu;
 2399         vlapic->vcpuid = vcpu->vcpuid;
 2400         vlapic->apic_page = malloc_aligned(PAGE_SIZE, PAGE_SIZE, M_SVM_VLAPIC,
 2401             M_WAITOK | M_ZERO);
 2402 
 2403         vlapic_init(vlapic);
 2404 
 2405         return (vlapic);
 2406 }
 2407 
 2408 static void
 2409 svm_vlapic_cleanup(struct vlapic *vlapic)
 2410 {
 2411 
 2412         vlapic_cleanup(vlapic);
 2413         free(vlapic->apic_page, M_SVM_VLAPIC);
 2414         free(vlapic, M_SVM_VLAPIC);
 2415 }
 2416 
 2417 #ifdef BHYVE_SNAPSHOT
 2418 static int
 2419 svm_snapshot(void *vmi, struct vm_snapshot_meta *meta)
 2420 {
 2421         if (meta->op == VM_SNAPSHOT_RESTORE)
 2422                 flush_by_asid();
 2423 
 2424         return (0);
 2425 }
 2426 
 2427 static int
 2428 svm_vcpu_snapshot(void *vcpui, struct vm_snapshot_meta *meta)
 2429 {
 2430         struct svm_vcpu *vcpu;
 2431         int err, running, hostcpu;
 2432 
 2433         vcpu = vcpui;
 2434         err = 0;
 2435 
 2436         running = vcpu_is_running(vcpu->vcpu, &hostcpu);
 2437         if (running && hostcpu != curcpu) {
 2438                 printf("%s: %s%d is running", __func__, vm_name(vcpu->sc->vm),
 2439                     vcpu->vcpuid);
 2440                 return (EINVAL);
 2441         }
 2442 
 2443         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CR0, meta);
 2444         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CR2, meta);
 2445         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CR3, meta);
 2446         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CR4, meta);
 2447 
 2448         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_DR6, meta);
 2449         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_DR7, meta);
 2450 
 2451         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_RAX, meta);
 2452 
 2453         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_RSP, meta);
 2454         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_RIP, meta);
 2455         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_RFLAGS, meta);
 2456 
 2457         /* Guest segments */
 2458         /* ES */
 2459         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_ES, meta);
 2460         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_ES, meta);
 2461 
 2462         /* CS */
 2463         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_CS, meta);
 2464         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_CS, meta);
 2465 
 2466         /* SS */
 2467         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_SS, meta);
 2468         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_SS, meta);
 2469 
 2470         /* DS */
 2471         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_DS, meta);
 2472         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_DS, meta);
 2473 
 2474         /* FS */
 2475         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_FS, meta);
 2476         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_FS, meta);
 2477 
 2478         /* GS */
 2479         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_GS, meta);
 2480         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_GS, meta);
 2481 
 2482         /* TR */
 2483         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_TR, meta);
 2484         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_TR, meta);
 2485 
 2486         /* LDTR */
 2487         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_LDTR, meta);
 2488         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_LDTR, meta);
 2489 
 2490         /* EFER */
 2491         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_EFER, meta);
 2492 
 2493         /* IDTR and GDTR */
 2494         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_IDTR, meta);
 2495         err += vmcb_snapshot_desc(vcpu, VM_REG_GUEST_GDTR, meta);
 2496 
 2497         /* Specific AMD registers */
 2498         err += svm_snapshot_reg(vcpu, VM_REG_GUEST_INTR_SHADOW, meta);
 2499 
 2500         err += vmcb_snapshot_any(vcpu,
 2501                                 VMCB_ACCESS(VMCB_OFF_CR_INTERCEPT, 4), meta);
 2502         err += vmcb_snapshot_any(vcpu,
 2503                                 VMCB_ACCESS(VMCB_OFF_DR_INTERCEPT, 4), meta);
 2504         err += vmcb_snapshot_any(vcpu,
 2505                                 VMCB_ACCESS(VMCB_OFF_EXC_INTERCEPT, 4), meta);
 2506         err += vmcb_snapshot_any(vcpu,
 2507                                 VMCB_ACCESS(VMCB_OFF_INST1_INTERCEPT, 4), meta);
 2508         err += vmcb_snapshot_any(vcpu,
 2509                                 VMCB_ACCESS(VMCB_OFF_INST2_INTERCEPT, 4), meta);
 2510 
 2511         err += vmcb_snapshot_any(vcpu,
 2512                                 VMCB_ACCESS(VMCB_OFF_PAUSE_FILTHRESH, 2), meta);
 2513         err += vmcb_snapshot_any(vcpu,
 2514                                 VMCB_ACCESS(VMCB_OFF_PAUSE_FILCNT, 2), meta);
 2515 
 2516         err += vmcb_snapshot_any(vcpu,
 2517                                 VMCB_ACCESS(VMCB_OFF_ASID, 4), meta);
 2518 
 2519         err += vmcb_snapshot_any(vcpu,
 2520                                 VMCB_ACCESS(VMCB_OFF_TLB_CTRL, 4), meta);
 2521 
 2522         err += vmcb_snapshot_any(vcpu,
 2523                                 VMCB_ACCESS(VMCB_OFF_VIRQ, 8), meta);
 2524 
 2525         err += vmcb_snapshot_any(vcpu,
 2526                                 VMCB_ACCESS(VMCB_OFF_EXIT_REASON, 8), meta);
 2527         err += vmcb_snapshot_any(vcpu,
 2528                                 VMCB_ACCESS(VMCB_OFF_EXITINFO1, 8), meta);
 2529         err += vmcb_snapshot_any(vcpu,
 2530                                 VMCB_ACCESS(VMCB_OFF_EXITINFO2, 8), meta);
 2531         err += vmcb_snapshot_any(vcpu,
 2532                                 VMCB_ACCESS(VMCB_OFF_EXITINTINFO, 8), meta);
 2533 
 2534         err += vmcb_snapshot_any(vcpu,
 2535                                 VMCB_ACCESS(VMCB_OFF_NP_ENABLE, 1), meta);
 2536 
 2537         err += vmcb_snapshot_any(vcpu,
 2538                                 VMCB_ACCESS(VMCB_OFF_AVIC_BAR, 8), meta);
 2539         err += vmcb_snapshot_any(vcpu,
 2540                                 VMCB_ACCESS(VMCB_OFF_AVIC_PAGE, 8), meta);
 2541         err += vmcb_snapshot_any(vcpu,
 2542                                 VMCB_ACCESS(VMCB_OFF_AVIC_LT, 8), meta);
 2543         err += vmcb_snapshot_any(vcpu,
 2544                                 VMCB_ACCESS(VMCB_OFF_AVIC_PT, 8), meta);
 2545 
 2546         err += vmcb_snapshot_any(vcpu,
 2547                                 VMCB_ACCESS(VMCB_OFF_CPL, 1), meta);
 2548 
 2549         err += vmcb_snapshot_any(vcpu,
 2550                                 VMCB_ACCESS(VMCB_OFF_STAR, 8), meta);
 2551         err += vmcb_snapshot_any(vcpu,
 2552                                 VMCB_ACCESS(VMCB_OFF_LSTAR, 8), meta);
 2553         err += vmcb_snapshot_any(vcpu,
 2554                                 VMCB_ACCESS(VMCB_OFF_CSTAR, 8), meta);
 2555 
 2556         err += vmcb_snapshot_any(vcpu,
 2557                                 VMCB_ACCESS(VMCB_OFF_SFMASK, 8), meta);
 2558 
 2559         err += vmcb_snapshot_any(vcpu,
 2560                                 VMCB_ACCESS(VMCB_OFF_KERNELGBASE, 8), meta);
 2561 
 2562         err += vmcb_snapshot_any(vcpu,
 2563                                 VMCB_ACCESS(VMCB_OFF_SYSENTER_CS, 8), meta);
 2564         err += vmcb_snapshot_any(vcpu,
 2565                                 VMCB_ACCESS(VMCB_OFF_SYSENTER_ESP, 8), meta);
 2566         err += vmcb_snapshot_any(vcpu,
 2567                                 VMCB_ACCESS(VMCB_OFF_SYSENTER_EIP, 8), meta);
 2568 
 2569         err += vmcb_snapshot_any(vcpu,
 2570                                 VMCB_ACCESS(VMCB_OFF_GUEST_PAT, 8), meta);
 2571 
 2572         err += vmcb_snapshot_any(vcpu,
 2573                                 VMCB_ACCESS(VMCB_OFF_DBGCTL, 8), meta);
 2574         err += vmcb_snapshot_any(vcpu,
 2575                                 VMCB_ACCESS(VMCB_OFF_BR_FROM, 8), meta);
 2576         err += vmcb_snapshot_any(vcpu,
 2577                                 VMCB_ACCESS(VMCB_OFF_BR_TO, 8), meta);
 2578         err += vmcb_snapshot_any(vcpu,
 2579                                 VMCB_ACCESS(VMCB_OFF_INT_FROM, 8), meta);
 2580         err += vmcb_snapshot_any(vcpu,
 2581                                 VMCB_ACCESS(VMCB_OFF_INT_TO, 8), meta);
 2582         if (err != 0)
 2583                 goto done;
 2584 
 2585         /* Snapshot swctx for virtual cpu */
 2586         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbp, meta, err, done);
 2587         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rbx, meta, err, done);
 2588         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rcx, meta, err, done);
 2589         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdx, meta, err, done);
 2590         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rdi, meta, err, done);
 2591         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_rsi, meta, err, done);
 2592         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r8, meta, err, done);
 2593         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r9, meta, err, done);
 2594         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r10, meta, err, done);
 2595         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r11, meta, err, done);
 2596         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r12, meta, err, done);
 2597         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r13, meta, err, done);
 2598         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r14, meta, err, done);
 2599         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_r15, meta, err, done);
 2600         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr0, meta, err, done);
 2601         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr1, meta, err, done);
 2602         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr2, meta, err, done);
 2603         SNAPSHOT_VAR_OR_LEAVE(vcpu->swctx.sctx_dr3, meta, err, done);
 2604 
 2605         /* Restore other svm_vcpu struct fields */
 2606 
 2607         /* Restore NEXTRIP field */
 2608         SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, err, done);
 2609 
 2610         /* Restore lastcpu field */
 2611         SNAPSHOT_VAR_OR_LEAVE(vcpu->lastcpu, meta, err, done);
 2612         SNAPSHOT_VAR_OR_LEAVE(vcpu->dirty, meta, err, done);
 2613 
 2614         /* Restore eptgen field - the nested page table generation count */
 2615         SNAPSHOT_VAR_OR_LEAVE(vcpu->eptgen, meta, err, done);
 2616 
 2617         SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.gen, meta, err, done);
 2618         SNAPSHOT_VAR_OR_LEAVE(vcpu->asid.num, meta, err, done);
 2619 
 2620         /* Set all caches dirty */
 2621         if (meta->op == VM_SNAPSHOT_RESTORE)
 2622                 svm_set_dirty(vcpu, 0xffffffff);
 2623 
 2624 done:
 2625         return (err);
 2626 }
 2627 
 2628 static int
 2629 svm_restore_tsc(void *vcpui, uint64_t offset)
 2630 {
 2631         struct svm_vcpu *vcpu = vcpui;
 2632 
 2633         svm_set_tsc_offset(vcpu, offset);
 2634 
 2635         return (0);
 2636 }
 2637 #endif
 2638 
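      /*
       * Entry points through which the machine-independent vmm code
       * drives the AMD SVM backend.
       */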
 2639 const struct vmm_ops vmm_ops_amd = {
 2640         .modinit        = svm_modinit,
 2641         .modcleanup     = svm_modcleanup,
 2642         .modresume      = svm_modresume,
 2643         .init           = svm_init,
 2644         .run            = svm_run,
 2645         .cleanup        = svm_cleanup,
 2646         .vcpu_init      = svm_vcpu_init,
 2647         .vcpu_cleanup   = svm_vcpu_cleanup,
 2648         .getreg         = svm_getreg,
 2649         .setreg         = svm_setreg,
 2650         .getdesc        = svm_getdesc,
 2651         .setdesc        = svm_setdesc,
 2652         .getcap         = svm_getcap,
 2653         .setcap         = svm_setcap,
 2654         .vmspace_alloc  = svm_vmspace_alloc,
 2655         .vmspace_free   = svm_vmspace_free,
 2656         .vlapic_init    = svm_vlapic_init,
 2657         .vlapic_cleanup = svm_vlapic_cleanup,
 2658 #ifdef BHYVE_SNAPSHOT
 2659         .snapshot       = svm_snapshot,
 2660         .vcpu_snapshot  = svm_vcpu_snapshot,
 2661         .restore_tsc    = svm_restore_tsc,
 2662 #endif
 2663 };
