FreeBSD/Linux Kernel Cross Reference
sys/amd64/vmm/vmm.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2011 NetApp, Inc.
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  *
   28  * $FreeBSD$
   29  */
   30 
   31 #include <sys/cdefs.h>
   32 __FBSDID("$FreeBSD$");
   33 
   34 #include "opt_bhyve_snapshot.h"
   35 
   36 #include <sys/param.h>
   37 #include <sys/systm.h>
   38 #include <sys/kernel.h>
   39 #include <sys/module.h>
   40 #include <sys/sysctl.h>
   41 #include <sys/malloc.h>
   42 #include <sys/pcpu.h>
   43 #include <sys/lock.h>
   44 #include <sys/mutex.h>
   45 #include <sys/proc.h>
   46 #include <sys/rwlock.h>
   47 #include <sys/sched.h>
   48 #include <sys/smp.h>
   49 #include <sys/sx.h>
   50 #include <sys/vnode.h>
   51 
   52 #include <vm/vm.h>
   53 #include <vm/vm_param.h>
   54 #include <vm/vm_extern.h>
   55 #include <vm/vm_object.h>
   56 #include <vm/vm_page.h>
   57 #include <vm/pmap.h>
   58 #include <vm/vm_map.h>
   59 #include <vm/vm_pager.h>
   60 #include <vm/vm_kern.h>
   61 #include <vm/vnode_pager.h>
   62 #include <vm/swap_pager.h>
   63 #include <vm/uma.h>
   64 
   65 #include <machine/cpu.h>
   66 #include <machine/pcb.h>
   67 #include <machine/smp.h>
   68 #include <machine/md_var.h>
   69 #include <x86/psl.h>
   70 #include <x86/apicreg.h>
   71 #include <x86/ifunc.h>
   72 
   73 #include <machine/vmm.h>
   74 #include <machine/vmm_dev.h>
   75 #include <machine/vmm_instruction_emul.h>
   76 #include <machine/vmm_snapshot.h>
   77 
   78 #include "vmm_ioport.h"
   79 #include "vmm_ktr.h"
   80 #include "vmm_host.h"
   81 #include "vmm_mem.h"
   82 #include "vmm_util.h"
   83 #include "vatpic.h"
   84 #include "vatpit.h"
   85 #include "vhpet.h"
   86 #include "vioapic.h"
   87 #include "vlapic.h"
   88 #include "vpmtmr.h"
   89 #include "vrtc.h"
   90 #include "vmm_stat.h"
   91 #include "vmm_lapic.h"
   92 
   93 #include "io/ppt.h"
   94 #include "io/iommu.h"
   95 
   96 struct vlapic;
   97 
   98 /*
   99  * Initialization:
  100  * (a) allocated when vcpu is created
  101  * (i) initialized when vcpu is created and when it is reinitialized
  102  * (o) initialized the first time the vcpu is created
  103  * (x) initialized before use
  104  */
  105 struct vcpu {
  106         struct mtx      mtx;            /* (o) protects 'state' and 'hostcpu' */
  107         enum vcpu_state state;          /* (o) vcpu state */
  108         int             vcpuid;         /* (o) */
  109         int             hostcpu;        /* (o) vcpu's host cpu */
  110         int             reqidle;        /* (i) request vcpu to idle */
  111         struct vm       *vm;            /* (o) */
  112         void            *cookie;        /* (i) cpu-specific data */
  113         struct vlapic   *vlapic;        /* (i) APIC device model */
  114         enum x2apic_state x2apic_state; /* (i) APIC mode */
  115         uint64_t        exitintinfo;    /* (i) events pending at VM exit */
  116         int             nmi_pending;    /* (i) NMI pending */
  117         int             extint_pending; /* (i) INTR pending */
  118         int     exception_pending;      /* (i) exception pending */
  119         int     exc_vector;             /* (x) exception collateral */
  120         int     exc_errcode_valid;
  121         uint32_t exc_errcode;
  122         struct savefpu  *guestfpu;      /* (a,i) guest fpu state */
  123         uint64_t        guest_xcr0;     /* (i) guest %xcr0 register */
  124         void            *stats;         /* (a,i) statistics */
  125         struct vm_exit  exitinfo;       /* (x) exit reason and collateral */
  126         uint64_t        nextrip;        /* (x) next instruction to execute */
  127         uint64_t        tsc_offset;     /* (o) TSC offsetting */
  128 };
  129 
  130 #define vcpu_lock_init(v)       mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
  131 #define vcpu_lock_destroy(v)    mtx_destroy(&((v)->mtx))
  132 #define vcpu_lock(v)            mtx_lock_spin(&((v)->mtx))
  133 #define vcpu_unlock(v)          mtx_unlock_spin(&((v)->mtx))
  134 #define vcpu_assert_locked(v)   mtx_assert(&((v)->mtx), MA_OWNED)
  135 
  136 struct mem_seg {
  137         size_t  len;
  138         bool    sysmem;
  139         struct vm_object *object;
  140 };
  141 #define VM_MAX_MEMSEGS  4
  142 
  143 struct mem_map {
  144         vm_paddr_t      gpa;
  145         size_t          len;
  146         vm_ooffset_t    segoff;
  147         int             segid;
  148         int             prot;
  149         int             flags;
  150 };
  151 #define VM_MAX_MEMMAPS  8
  152 
  153 /*
  154  * Initialization:
  155  * (o) initialized the first time the VM is created
  156  * (i) initialized when VM is created and when it is reinitialized
  157  * (x) initialized before use
  158  *
  159  * Locking:
  160  * [m] mem_segs_lock
  161  * [r] rendezvous_mtx
  162  * [v] reads require one frozen vcpu, writes require freezing all vcpus
  163  */
  164 struct vm {
  165         void            *cookie;                /* (i) cpu-specific data */
  166         void            *iommu;                 /* (x) iommu-specific data */
  167         struct vhpet    *vhpet;                 /* (i) virtual HPET */
  168         struct vioapic  *vioapic;               /* (i) virtual ioapic */
  169         struct vatpic   *vatpic;                /* (i) virtual atpic */
  170         struct vatpit   *vatpit;                /* (i) virtual atpit */
  171         struct vpmtmr   *vpmtmr;                /* (i) virtual ACPI PM timer */
  172         struct vrtc     *vrtc;                  /* (o) virtual RTC */
  173         volatile cpuset_t active_cpus;          /* (i) active vcpus */
  174         volatile cpuset_t debug_cpus;           /* (i) vcpus stopped for debug */
  175         cpuset_t        startup_cpus;           /* (i) [r] waiting for startup */
  176         int             suspend;                /* (i) stop VM execution */
  177         bool            dying;                  /* (o) is dying */
  178         volatile cpuset_t suspended_cpus;       /* (i) suspended vcpus */
  179         volatile cpuset_t halted_cpus;          /* (x) cpus in a hard halt */
  180         cpuset_t        rendezvous_req_cpus;    /* (x) [r] rendezvous requested */
  181         cpuset_t        rendezvous_done_cpus;   /* (x) [r] rendezvous finished */
  182         void            *rendezvous_arg;        /* (x) [r] rendezvous func/arg */
  183         vm_rendezvous_func_t rendezvous_func;
  184         struct mtx      rendezvous_mtx;         /* (o) rendezvous lock */
  185         struct mem_map  mem_maps[VM_MAX_MEMMAPS]; /* (i) [m+v] guest address space */
  186         struct mem_seg  mem_segs[VM_MAX_MEMSEGS]; /* (o) [m+v] guest memory regions */
  187         struct vmspace  *vmspace;               /* (o) guest's address space */
  188         char            name[VM_MAX_NAMELEN+1]; /* (o) virtual machine name */
  189         struct vcpu     **vcpu;                 /* (o) guest vcpus */
  190         /* The following describe the vm cpu topology */
  191         uint16_t        sockets;                /* (o) num of sockets */
  192         uint16_t        cores;                  /* (o) num of cores/socket */
  193         uint16_t        threads;                /* (o) num of threads/core */
  194         uint16_t        maxcpus;                /* (o) max pluggable cpus */
  195         struct sx       mem_segs_lock;          /* (o) */
  196         struct sx       vcpus_init_lock;        /* (o) */
  197 };
  198 
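/*
 * Illustrative sketch, not part of vmm.c: the "[v]" convention above means a
 * reader of 'mem_maps[]' and 'mem_segs[]' needs only one frozen vcpu, while a
 * writer must freeze every vcpu first.  The hypothetical helper below shows
 * the writer side using functions defined later in this file.  Error
 * unwinding is omitted, and note that vm_alloc_vcpu() instantiates missing
 * vcpus as a side effect; the real ioctl path only walks vcpus that already
 * exist.
 */
static int
example_freeze_all_vcpus(struct vm *vm)
{
	struct vcpu *vcpu;
	uint16_t i, maxcpus;
	int error;

	maxcpus = vm_get_maxcpus(vm);
	for (i = 0; i < maxcpus; i++) {
		vcpu = vm_alloc_vcpu(vm, i);
		if (vcpu == NULL)
			continue;
		/* IDLE -> FROZEN; from_idle=true waits until the vcpu idles. */
		error = vcpu_set_state(vcpu, VCPU_FROZEN, true);
		if (error != 0)
			return (error);
	}
	/* All vcpus frozen: safe to modify vm->mem_maps[] here. */
	return (0);
}
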
  199 #define VMM_CTR0(vcpu, format)                                          \
  200         VCPU_CTR0((vcpu)->vm, (vcpu)->vcpuid, format)
  201 
  202 #define VMM_CTR1(vcpu, format, p1)                                      \
  203         VCPU_CTR1((vcpu)->vm, (vcpu)->vcpuid, format, p1)
  204 
  205 #define VMM_CTR2(vcpu, format, p1, p2)                                  \
  206         VCPU_CTR2((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2)
  207 
  208 #define VMM_CTR3(vcpu, format, p1, p2, p3)                              \
  209         VCPU_CTR3((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3)
  210 
  211 #define VMM_CTR4(vcpu, format, p1, p2, p3, p4)                          \
  212         VCPU_CTR4((vcpu)->vm, (vcpu)->vcpuid, format, p1, p2, p3, p4)
  213 
  214 static int vmm_initialized;
  215 
  216 static void     vmmops_panic(void);
  217 
  218 static void
  219 vmmops_panic(void)
  220 {
  221         panic("vmm_ops func called when !vmm_is_intel() && !vmm_is_svm()");
  222 }
  223 
  224 #define DEFINE_VMMOPS_IFUNC(ret_type, opname, args)                     \
  225     DEFINE_IFUNC(static, ret_type, vmmops_##opname, args)               \
  226     {                                                                   \
  227         if (vmm_is_intel())                                             \
  228                 return (vmm_ops_intel.opname);                          \
  229         else if (vmm_is_svm())                                          \
  230                 return (vmm_ops_amd.opname);                            \
  231         else                                                            \
  232                 return ((ret_type (*)args)vmmops_panic);                \
  233     }
  234 
  235 DEFINE_VMMOPS_IFUNC(int, modinit, (int ipinum))
  236 DEFINE_VMMOPS_IFUNC(int, modcleanup, (void))
  237 DEFINE_VMMOPS_IFUNC(void, modresume, (void))
  238 DEFINE_VMMOPS_IFUNC(void *, init, (struct vm *vm, struct pmap *pmap))
  239 DEFINE_VMMOPS_IFUNC(int, run, (void *vcpui, register_t rip, struct pmap *pmap,
  240     struct vm_eventinfo *info))
  241 DEFINE_VMMOPS_IFUNC(void, cleanup, (void *vmi))
  242 DEFINE_VMMOPS_IFUNC(void *, vcpu_init, (void *vmi, struct vcpu *vcpu,
  243     int vcpu_id))
  244 DEFINE_VMMOPS_IFUNC(void, vcpu_cleanup, (void *vcpui))
  245 DEFINE_VMMOPS_IFUNC(int, getreg, (void *vcpui, int num, uint64_t *retval))
  246 DEFINE_VMMOPS_IFUNC(int, setreg, (void *vcpui, int num, uint64_t val))
  247 DEFINE_VMMOPS_IFUNC(int, getdesc, (void *vcpui, int num, struct seg_desc *desc))
  248 DEFINE_VMMOPS_IFUNC(int, setdesc, (void *vcpui, int num, struct seg_desc *desc))
  249 DEFINE_VMMOPS_IFUNC(int, getcap, (void *vcpui, int num, int *retval))
  250 DEFINE_VMMOPS_IFUNC(int, setcap, (void *vcpui, int num, int val))
  251 DEFINE_VMMOPS_IFUNC(struct vmspace *, vmspace_alloc, (vm_offset_t min,
  252     vm_offset_t max))
  253 DEFINE_VMMOPS_IFUNC(void, vmspace_free, (struct vmspace *vmspace))
  254 DEFINE_VMMOPS_IFUNC(struct vlapic *, vlapic_init, (void *vcpui))
  255 DEFINE_VMMOPS_IFUNC(void, vlapic_cleanup, (struct vlapic *vlapic))
  256 #ifdef BHYVE_SNAPSHOT
  257 DEFINE_VMMOPS_IFUNC(int, snapshot, (void *vmi, struct vm_snapshot_meta *meta))
  258 DEFINE_VMMOPS_IFUNC(int, vcpu_snapshot, (void *vcpui,
  259     struct vm_snapshot_meta *meta))
  260 DEFINE_VMMOPS_IFUNC(int, restore_tsc, (void *vcpui, uint64_t now))
  261 #endif
  262 
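/*
 * Illustrative sketch, not part of vmm.c: each DEFINE_VMMOPS_IFUNC above
 * emits an ifunc whose resolver runs once, when the symbol is first bound,
 * and selects the Intel (VT-x) or AMD (SVM) backend.  The effect is roughly
 * the hand-rolled dispatch below, minus the per-call pointer load; the
 * 'example_*' names are hypothetical.
 */
static int (*example_getreg)(void *vcpui, int num, uint64_t *retval);

static void
example_resolve_getreg(void)
{
	/* Decided once, from the CPU vendor probed at module load time. */
	if (vmm_is_intel())
		example_getreg = vmm_ops_intel.getreg;
	else if (vmm_is_svm())
		example_getreg = vmm_ops_amd.getreg;
	else
		example_getreg = (int (*)(void *, int, uint64_t *))vmmops_panic;
}
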
  263 #define fpu_start_emulating()   load_cr0(rcr0() | CR0_TS)
  264 #define fpu_stop_emulating()    clts()
  265 
  266 SDT_PROVIDER_DEFINE(vmm);
  267 
  268 static MALLOC_DEFINE(M_VM, "vm", "vm");
  269 
  270 /* statistics */
  271 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
  272 
  273 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
  274     NULL);
  275 
  276 /*
  277  * Halt the guest if all vcpus are executing a HLT instruction with
  278  * interrupts disabled.
  279  */
  280 static int halt_detection_enabled = 1;
  281 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
  282     &halt_detection_enabled, 0,
  283     "Halt VM if all vcpus execute HLT with interrupts disabled");
  284 
  285 static int vmm_ipinum;
  286 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
  287     "IPI vector used for vcpu notifications");
  288 
  289 static int trace_guest_exceptions;
  290 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
  291     &trace_guest_exceptions, 0,
  292     "Trap into hypervisor on all guest exceptions and reflect them back");
  293 
  294 static int trap_wbinvd;
  295 SYSCTL_INT(_hw_vmm, OID_AUTO, trap_wbinvd, CTLFLAG_RDTUN, &trap_wbinvd, 0,
  296     "WBINVD triggers a VM-exit");
  297 
  298 u_int vm_maxcpu;
  299 SYSCTL_UINT(_hw_vmm, OID_AUTO, maxcpu, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
  300     &vm_maxcpu, 0, "Maximum number of vCPUs");
  301 
  302 static void vm_free_memmap(struct vm *vm, int ident);
  303 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
  304 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
  305 
  306 /*
  307  * Upper limit on vm_maxcpu.  Limited by use of uint16_t types for CPU
  308  * counts as well as range of vpid values for VT-x and by the capacity
  309  * of cpuset_t masks.  The call to new_unrhdr() in vpid_init() in
  310  * vmx.c requires 'vm_maxcpu + 1 <= 0xffff', hence the '- 1' below.
  311  */
  312 #define VM_MAXCPU       MIN(0xffff - 1, CPU_SETSIZE)
  313 
  314 #ifdef KTR
  315 static const char *
  316 vcpu_state2str(enum vcpu_state state)
  317 {
  318 
  319         switch (state) {
  320         case VCPU_IDLE:
  321                 return ("idle");
  322         case VCPU_FROZEN:
  323                 return ("frozen");
  324         case VCPU_RUNNING:
  325                 return ("running");
  326         case VCPU_SLEEPING:
  327                 return ("sleeping");
  328         default:
  329                 return ("unknown");
  330         }
  331 }
  332 #endif
  333 
  334 static void
  335 vcpu_cleanup(struct vcpu *vcpu, bool destroy)
  336 {
  337         vmmops_vlapic_cleanup(vcpu->vlapic);
  338         vmmops_vcpu_cleanup(vcpu->cookie);
  339         vcpu->cookie = NULL;
  340         if (destroy) {
   341                 vmm_stat_free(vcpu->stats);
  342                 fpu_save_area_free(vcpu->guestfpu);
  343                 vcpu_lock_destroy(vcpu);
  344                 free(vcpu, M_VM);
  345         }
  346 }
  347 
  348 static struct vcpu *
  349 vcpu_alloc(struct vm *vm, int vcpu_id)
  350 {
  351         struct vcpu *vcpu;
  352 
  353         KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
   354             ("vcpu_alloc: invalid vcpu %d", vcpu_id));
  355 
  356         vcpu = malloc(sizeof(*vcpu), M_VM, M_WAITOK | M_ZERO);
  357         vcpu_lock_init(vcpu);
  358         vcpu->state = VCPU_IDLE;
  359         vcpu->hostcpu = NOCPU;
  360         vcpu->vcpuid = vcpu_id;
  361         vcpu->vm = vm;
  362         vcpu->guestfpu = fpu_save_area_alloc();
  363         vcpu->stats = vmm_stat_alloc();
  364         vcpu->tsc_offset = 0;
  365         return (vcpu);
  366 }
  367 
  368 static void
  369 vcpu_init(struct vcpu *vcpu)
  370 {
  371         vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
  372         vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie);
  373         vm_set_x2apic_state(vcpu, X2APIC_DISABLED);
  374         vcpu->reqidle = 0;
  375         vcpu->exitintinfo = 0;
  376         vcpu->nmi_pending = 0;
  377         vcpu->extint_pending = 0;
  378         vcpu->exception_pending = 0;
  379         vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
  380         fpu_save_area_reset(vcpu->guestfpu);
  381         vmm_stat_init(vcpu->stats);
  382 }
  383 
  384 int
  385 vcpu_trace_exceptions(struct vcpu *vcpu)
  386 {
  387 
  388         return (trace_guest_exceptions);
  389 }
  390 
  391 int
  392 vcpu_trap_wbinvd(struct vcpu *vcpu)
  393 {
  394         return (trap_wbinvd);
  395 }
  396 
  397 struct vm_exit *
  398 vm_exitinfo(struct vcpu *vcpu)
  399 {
  400         return (&vcpu->exitinfo);
  401 }
  402 
  403 static int
  404 vmm_init(void)
  405 {
  406         int error;
  407 
  408         if (!vmm_is_hw_supported())
  409                 return (ENXIO);
  410 
  411         vm_maxcpu = mp_ncpus;
  412         TUNABLE_INT_FETCH("hw.vmm.maxcpu", &vm_maxcpu);
  413 
  414         if (vm_maxcpu > VM_MAXCPU) {
  415                 printf("vmm: vm_maxcpu clamped to %u\n", VM_MAXCPU);
  416                 vm_maxcpu = VM_MAXCPU;
  417         }
  418         if (vm_maxcpu == 0)
  419                 vm_maxcpu = 1;
  420 
  421         vmm_host_state_init();
  422 
  423         vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
  424             &IDTVEC(justreturn));
  425         if (vmm_ipinum < 0)
  426                 vmm_ipinum = IPI_AST;
  427 
  428         error = vmm_mem_init();
  429         if (error)
  430                 return (error);
  431 
  432         vmm_resume_p = vmmops_modresume;
  433 
  434         return (vmmops_modinit(vmm_ipinum));
  435 }
  436 
  437 static int
  438 vmm_handler(module_t mod, int what, void *arg)
  439 {
  440         int error;
  441 
  442         switch (what) {
  443         case MOD_LOAD:
  444                 if (vmm_is_hw_supported()) {
  445                         vmmdev_init();
  446                         error = vmm_init();
  447                         if (error == 0)
  448                                 vmm_initialized = 1;
  449                 } else {
  450                         error = ENXIO;
  451                 }
  452                 break;
  453         case MOD_UNLOAD:
  454                 if (vmm_is_hw_supported()) {
  455                         error = vmmdev_cleanup();
  456                         if (error == 0) {
  457                                 vmm_resume_p = NULL;
  458                                 iommu_cleanup();
  459                                 if (vmm_ipinum != IPI_AST)
  460                                         lapic_ipi_free(vmm_ipinum);
  461                                 error = vmmops_modcleanup();
  462                                 /*
  463                                  * Something bad happened - prevent new
  464                                  * VMs from being created
  465                                  */
  466                                 if (error)
  467                                         vmm_initialized = 0;
  468                         }
  469                 } else {
  470                         error = 0;
  471                 }
  472                 break;
  473         default:
  474                 error = 0;
  475                 break;
  476         }
  477         return (error);
  478 }
  479 
  480 static moduledata_t vmm_kmod = {
  481         "vmm",
  482         vmm_handler,
  483         NULL
  484 };
  485 
  486 /*
  487  * vmm initialization has the following dependencies:
  488  *
  489  * - VT-x initialization requires smp_rendezvous() and therefore must happen
  490  *   after SMP is fully functional (after SI_SUB_SMP).
  491  */
  492 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
  493 MODULE_VERSION(vmm, 1);
  494 
  495 static void
  496 vm_init(struct vm *vm, bool create)
  497 {
  498         vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
  499         vm->iommu = NULL;
  500         vm->vioapic = vioapic_init(vm);
  501         vm->vhpet = vhpet_init(vm);
  502         vm->vatpic = vatpic_init(vm);
  503         vm->vatpit = vatpit_init(vm);
  504         vm->vpmtmr = vpmtmr_init(vm);
  505         if (create)
  506                 vm->vrtc = vrtc_init(vm);
  507 
  508         CPU_ZERO(&vm->active_cpus);
  509         CPU_ZERO(&vm->debug_cpus);
  510         CPU_ZERO(&vm->startup_cpus);
  511 
  512         vm->suspend = 0;
  513         CPU_ZERO(&vm->suspended_cpus);
  514 
  515         if (!create) {
  516                 for (int i = 0; i < vm->maxcpus; i++) {
  517                         if (vm->vcpu[i] != NULL)
  518                                 vcpu_init(vm->vcpu[i]);
  519                 }
  520         }
  521 }
  522 
  523 void
  524 vm_disable_vcpu_creation(struct vm *vm)
  525 {
  526         sx_xlock(&vm->vcpus_init_lock);
  527         vm->dying = true;
  528         sx_xunlock(&vm->vcpus_init_lock);
  529 }
  530 
  531 struct vcpu *
  532 vm_alloc_vcpu(struct vm *vm, int vcpuid)
  533 {
  534         struct vcpu *vcpu;
  535 
  536         if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
  537                 return (NULL);
  538 
  539         vcpu = atomic_load_ptr(&vm->vcpu[vcpuid]);
  540         if (__predict_true(vcpu != NULL))
  541                 return (vcpu);
  542 
  543         sx_xlock(&vm->vcpus_init_lock);
  544         vcpu = vm->vcpu[vcpuid];
  545         if (vcpu == NULL && !vm->dying) {
  546                 vcpu = vcpu_alloc(vm, vcpuid);
  547                 vcpu_init(vcpu);
  548 
  549                 /*
  550                  * Ensure vCPU is fully created before updating pointer
  551                  * to permit unlocked reads above.
  552                  */
  553                 atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
  554                     (uintptr_t)vcpu);
  555         }
  556         sx_xunlock(&vm->vcpus_init_lock);
  557         return (vcpu);
  558 }
  559 
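/*
 * Illustrative sketch, not part of vmm.c: vm_alloc_vcpu() above publishes a
 * fully constructed vcpu with a release store so the unlocked
 * atomic_load_ptr() fast path never observes a half-initialized structure.
 * The same publish-then-read pattern, reduced to its essentials with
 * hypothetical 'example_*' names:
 */
struct example_obj {
	int	ready;			/* written before publication */
};

static struct example_obj *example_slot;	/* NULL until published */

static void
example_publish(struct example_obj *obj)
{
	obj->ready = 1;			/* (1) finish construction first */
	/* (2) then make the pointer visible; release semantics order (1). */
	atomic_store_rel_ptr((uintptr_t *)&example_slot, (uintptr_t)obj);
}

static struct example_obj *
example_lookup(void)
{
	/* A reader that sees a non-NULL pointer also sees the writes in (1). */
	return (atomic_load_ptr(&example_slot));
}
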
  560 void
  561 vm_slock_vcpus(struct vm *vm)
  562 {
  563         sx_slock(&vm->vcpus_init_lock);
  564 }
  565 
  566 void
  567 vm_unlock_vcpus(struct vm *vm)
  568 {
  569         sx_unlock(&vm->vcpus_init_lock);
  570 }
  571 
  572 /*
  573  * The default CPU topology is a single thread per package.
  574  */
  575 u_int cores_per_package = 1;
  576 u_int threads_per_core = 1;
  577 
  578 int
  579 vm_create(const char *name, struct vm **retvm)
  580 {
  581         struct vm *vm;
  582         struct vmspace *vmspace;
  583 
  584         /*
  585          * If vmm.ko could not be successfully initialized then don't attempt
  586          * to create the virtual machine.
  587          */
  588         if (!vmm_initialized)
  589                 return (ENXIO);
  590 
  591         if (name == NULL || strnlen(name, VM_MAX_NAMELEN + 1) ==
  592             VM_MAX_NAMELEN + 1)
  593                 return (EINVAL);
  594 
  595         vmspace = vmmops_vmspace_alloc(0, VM_MAXUSER_ADDRESS_LA48);
  596         if (vmspace == NULL)
  597                 return (ENOMEM);
  598 
  599         vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
  600         strcpy(vm->name, name);
  601         vm->vmspace = vmspace;
  602         mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
  603         sx_init(&vm->mem_segs_lock, "vm mem_segs");
  604         sx_init(&vm->vcpus_init_lock, "vm vcpus");
  605         vm->vcpu = malloc(sizeof(*vm->vcpu) * vm_maxcpu, M_VM, M_WAITOK |
  606             M_ZERO);
  607 
  608         vm->sockets = 1;
  609         vm->cores = cores_per_package;  /* XXX backwards compatibility */
  610         vm->threads = threads_per_core; /* XXX backwards compatibility */
  611         vm->maxcpus = vm_maxcpu;
  612 
  613         vm_init(vm, true);
  614 
  615         *retvm = vm;
  616         return (0);
  617 }
  618 
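/*
 * Illustrative sketch, not part of vmm.c: a minimal consumer of vm_create()
 * above and its counterpart vm_destroy() below.  'example_create_and_destroy'
 * is a hypothetical caller; real users (the vmm(4) character device) keep the
 * vm around and configure memory and vcpus before running it.
 */
static int
example_create_and_destroy(void)
{
	struct vm *vm;
	int error;

	/* Fails with ENXIO if vmm never initialized, EINVAL on a bad name. */
	error = vm_create("example-guest", &vm);
	if (error != 0)
		return (error);

	/* ... vm_alloc_memseg(), vm_mmap_memseg(), vm_alloc_vcpu(), ... */

	vm_destroy(vm);
	return (0);
}
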
  619 void
  620 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
  621     uint16_t *threads, uint16_t *maxcpus)
  622 {
  623         *sockets = vm->sockets;
  624         *cores = vm->cores;
  625         *threads = vm->threads;
  626         *maxcpus = vm->maxcpus;
  627 }
  628 
  629 uint16_t
  630 vm_get_maxcpus(struct vm *vm)
  631 {
  632         return (vm->maxcpus);
  633 }
  634 
  635 int
  636 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
  637     uint16_t threads, uint16_t maxcpus __unused)
  638 {
  639         /* Ignore maxcpus. */
  640         if ((sockets * cores * threads) > vm->maxcpus)
  641                 return (EINVAL);
  642         vm->sockets = sockets;
  643         vm->cores = cores;
  644         vm->threads = threads;
   645         return (0);
  646 }
  647 
  648 static void
  649 vm_cleanup(struct vm *vm, bool destroy)
  650 {
  651         struct mem_map *mm;
  652         int i;
  653 
  654         if (destroy)
  655                 vm_xlock_memsegs(vm);
  656 
  657         ppt_unassign_all(vm);
  658 
  659         if (vm->iommu != NULL)
  660                 iommu_destroy_domain(vm->iommu);
  661 
  662         if (destroy)
  663                 vrtc_cleanup(vm->vrtc);
  664         else
  665                 vrtc_reset(vm->vrtc);
  666         vpmtmr_cleanup(vm->vpmtmr);
  667         vatpit_cleanup(vm->vatpit);
  668         vhpet_cleanup(vm->vhpet);
  669         vatpic_cleanup(vm->vatpic);
  670         vioapic_cleanup(vm->vioapic);
  671 
  672         for (i = 0; i < vm->maxcpus; i++) {
  673                 if (vm->vcpu[i] != NULL)
  674                         vcpu_cleanup(vm->vcpu[i], destroy);
  675         }
  676 
  677         vmmops_cleanup(vm->cookie);
  678 
  679         /*
  680          * System memory is removed from the guest address space only when
  681          * the VM is destroyed. This is because the mapping remains the same
  682          * across VM reset.
  683          *
  684          * Device memory can be relocated by the guest (e.g. using PCI BARs)
  685          * so those mappings are removed on a VM reset.
  686          */
  687         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
  688                 mm = &vm->mem_maps[i];
  689                 if (destroy || !sysmem_mapping(vm, mm))
  690                         vm_free_memmap(vm, i);
  691         }
  692 
  693         if (destroy) {
  694                 for (i = 0; i < VM_MAX_MEMSEGS; i++)
  695                         vm_free_memseg(vm, i);
  696                 vm_unlock_memsegs(vm);
  697 
  698                 vmmops_vmspace_free(vm->vmspace);
  699                 vm->vmspace = NULL;
  700 
  701                 free(vm->vcpu, M_VM);
  702                 sx_destroy(&vm->vcpus_init_lock);
  703                 sx_destroy(&vm->mem_segs_lock);
  704                 mtx_destroy(&vm->rendezvous_mtx);
  705         }
  706 }
  707 
  708 void
  709 vm_destroy(struct vm *vm)
  710 {
  711         vm_cleanup(vm, true);
  712         free(vm, M_VM);
  713 }
  714 
  715 int
  716 vm_reinit(struct vm *vm)
  717 {
  718         int error;
  719 
  720         /*
  721          * A virtual machine can be reset only if all vcpus are suspended.
  722          */
  723         if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
  724                 vm_cleanup(vm, false);
  725                 vm_init(vm, false);
  726                 error = 0;
  727         } else {
  728                 error = EBUSY;
  729         }
  730 
  731         return (error);
  732 }
  733 
  734 const char *
  735 vm_name(struct vm *vm)
  736 {
  737         return (vm->name);
  738 }
  739 
  740 void
  741 vm_slock_memsegs(struct vm *vm)
  742 {
  743         sx_slock(&vm->mem_segs_lock);
  744 }
  745 
  746 void
  747 vm_xlock_memsegs(struct vm *vm)
  748 {
  749         sx_xlock(&vm->mem_segs_lock);
  750 }
  751 
  752 void
  753 vm_unlock_memsegs(struct vm *vm)
  754 {
  755         sx_unlock(&vm->mem_segs_lock);
  756 }
  757 
  758 int
  759 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
  760 {
  761         vm_object_t obj;
  762 
  763         if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
  764                 return (ENOMEM);
  765         else
  766                 return (0);
  767 }
  768 
  769 int
  770 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
  771 {
  772 
  773         vmm_mmio_free(vm->vmspace, gpa, len);
  774         return (0);
  775 }
  776 
  777 /*
  778  * Return 'true' if 'gpa' is allocated in the guest address space.
  779  *
  780  * This function is called in the context of a running vcpu which acts as
  781  * an implicit lock on 'vm->mem_maps[]'.
  782  */
  783 bool
  784 vm_mem_allocated(struct vcpu *vcpu, vm_paddr_t gpa)
  785 {
  786         struct vm *vm = vcpu->vm;
  787         struct mem_map *mm;
  788         int i;
  789 
  790 #ifdef INVARIANTS
  791         int hostcpu, state;
  792         state = vcpu_get_state(vcpu, &hostcpu);
  793         KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
  794             ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
  795 #endif
  796 
  797         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
  798                 mm = &vm->mem_maps[i];
  799                 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
  800                         return (true);          /* 'gpa' is sysmem or devmem */
  801         }
  802 
  803         if (ppt_is_mmio(vm, gpa))
  804                 return (true);                  /* 'gpa' is pci passthru mmio */
  805 
  806         return (false);
  807 }
  808 
  809 int
  810 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
  811 {
  812         struct mem_seg *seg;
  813         vm_object_t obj;
  814 
  815         sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
  816 
  817         if (ident < 0 || ident >= VM_MAX_MEMSEGS)
  818                 return (EINVAL);
  819 
  820         if (len == 0 || (len & PAGE_MASK))
  821                 return (EINVAL);
  822 
  823         seg = &vm->mem_segs[ident];
  824         if (seg->object != NULL) {
  825                 if (seg->len == len && seg->sysmem == sysmem)
  826                         return (EEXIST);
  827                 else
  828                         return (EINVAL);
  829         }
  830 
  831         obj = vm_object_allocate(OBJT_SWAP, len >> PAGE_SHIFT);
  832         if (obj == NULL)
  833                 return (ENOMEM);
  834 
  835         seg->len = len;
  836         seg->object = obj;
  837         seg->sysmem = sysmem;
  838         return (0);
  839 }
  840 
  841 int
  842 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
  843     vm_object_t *objptr)
  844 {
  845         struct mem_seg *seg;
  846 
  847         sx_assert(&vm->mem_segs_lock, SX_LOCKED);
  848 
  849         if (ident < 0 || ident >= VM_MAX_MEMSEGS)
  850                 return (EINVAL);
  851 
  852         seg = &vm->mem_segs[ident];
  853         if (len)
  854                 *len = seg->len;
  855         if (sysmem)
  856                 *sysmem = seg->sysmem;
  857         if (objptr)
  858                 *objptr = seg->object;
  859         return (0);
  860 }
  861 
  862 void
  863 vm_free_memseg(struct vm *vm, int ident)
  864 {
  865         struct mem_seg *seg;
  866 
  867         KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
  868             ("%s: invalid memseg ident %d", __func__, ident));
  869 
  870         seg = &vm->mem_segs[ident];
  871         if (seg->object != NULL) {
  872                 vm_object_deallocate(seg->object);
  873                 bzero(seg, sizeof(struct mem_seg));
  874         }
  875 }
  876 
  877 int
  878 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
  879     size_t len, int prot, int flags)
  880 {
  881         struct mem_seg *seg;
  882         struct mem_map *m, *map;
  883         vm_ooffset_t last;
  884         int i, error;
  885 
  886         if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
  887                 return (EINVAL);
  888 
  889         if (flags & ~VM_MEMMAP_F_WIRED)
  890                 return (EINVAL);
  891 
  892         if (segid < 0 || segid >= VM_MAX_MEMSEGS)
  893                 return (EINVAL);
  894 
  895         seg = &vm->mem_segs[segid];
  896         if (seg->object == NULL)
  897                 return (EINVAL);
  898 
  899         last = first + len;
  900         if (first < 0 || first >= last || last > seg->len)
  901                 return (EINVAL);
  902 
  903         if ((gpa | first | last) & PAGE_MASK)
  904                 return (EINVAL);
  905 
  906         map = NULL;
  907         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
  908                 m = &vm->mem_maps[i];
  909                 if (m->len == 0) {
  910                         map = m;
  911                         break;
  912                 }
  913         }
  914 
  915         if (map == NULL)
  916                 return (ENOSPC);
  917 
  918         error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
  919             len, 0, VMFS_NO_SPACE, prot, prot, 0);
  920         if (error != KERN_SUCCESS)
  921                 return (EFAULT);
  922 
  923         vm_object_reference(seg->object);
  924 
  925         if (flags & VM_MEMMAP_F_WIRED) {
  926                 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
  927                     VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
  928                 if (error != KERN_SUCCESS) {
  929                         vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
  930                         return (error == KERN_RESOURCE_SHORTAGE ? ENOMEM :
  931                             EFAULT);
  932                 }
  933         }
  934 
  935         map->gpa = gpa;
  936         map->len = len;
  937         map->segoff = first;
  938         map->segid = segid;
  939         map->prot = prot;
  940         map->flags = flags;
  941         return (0);
  942 }
  943 
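/*
 * Illustrative sketch, not part of vmm.c: a typical consumer (the vmm(4)
 * ioctl path acting for bhyve(8)) first creates a system-memory segment and
 * then maps it into the guest physical address space.  The hypothetical
 * helper below does both for a page-aligned 'len' starting at guest physical
 * address 0; error unwinding of a partially created segment is omitted.
 */
static int
example_setup_lowmem(struct vm *vm, size_t len)
{
	int error;

	vm_xlock_memsegs(vm);
	/* Segment 0: anonymous, swap-backed, flagged as system memory. */
	error = vm_alloc_memseg(vm, 0, len, true);
	if (error == 0) {
		/* Map the whole segment at gpa 0 with full permissions. */
		error = vm_mmap_memseg(vm, 0, 0, 0, len, VM_PROT_ALL, 0);
	}
	vm_unlock_memsegs(vm);
	return (error);
}
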
  944 int
  945 vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len)
  946 {
  947         struct mem_map *m;
  948         int i;
  949 
  950         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
  951                 m = &vm->mem_maps[i];
  952                 if (m->gpa == gpa && m->len == len &&
  953                     (m->flags & VM_MEMMAP_F_IOMMU) == 0) {
  954                         vm_free_memmap(vm, i);
  955                         return (0);
  956                 }
  957         }
  958 
  959         return (EINVAL);
  960 }
  961 
  962 int
  963 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
  964     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
  965 {
  966         struct mem_map *mm, *mmnext;
  967         int i;
  968 
  969         mmnext = NULL;
  970         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
  971                 mm = &vm->mem_maps[i];
  972                 if (mm->len == 0 || mm->gpa < *gpa)
  973                         continue;
  974                 if (mmnext == NULL || mm->gpa < mmnext->gpa)
  975                         mmnext = mm;
  976         }
  977 
  978         if (mmnext != NULL) {
  979                 *gpa = mmnext->gpa;
  980                 if (segid)
  981                         *segid = mmnext->segid;
  982                 if (segoff)
  983                         *segoff = mmnext->segoff;
  984                 if (len)
  985                         *len = mmnext->len;
  986                 if (prot)
  987                         *prot = mmnext->prot;
  988                 if (flags)
  989                         *flags = mmnext->flags;
  990                 return (0);
  991         } else {
  992                 return (ENOENT);
  993         }
  994 }
  995 
  996 static void
  997 vm_free_memmap(struct vm *vm, int ident)
  998 {
  999         struct mem_map *mm;
 1000         int error __diagused;
 1001 
 1002         mm = &vm->mem_maps[ident];
 1003         if (mm->len) {
 1004                 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
 1005                     mm->gpa + mm->len);
 1006                 KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
 1007                     __func__, error));
 1008                 bzero(mm, sizeof(struct mem_map));
 1009         }
 1010 }
 1011 
 1012 static __inline bool
 1013 sysmem_mapping(struct vm *vm, struct mem_map *mm)
 1014 {
 1015 
 1016         if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
 1017                 return (true);
 1018         else
 1019                 return (false);
 1020 }
 1021 
 1022 vm_paddr_t
 1023 vmm_sysmem_maxaddr(struct vm *vm)
 1024 {
 1025         struct mem_map *mm;
 1026         vm_paddr_t maxaddr;
 1027         int i;
 1028 
 1029         maxaddr = 0;
 1030         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 1031                 mm = &vm->mem_maps[i];
 1032                 if (sysmem_mapping(vm, mm)) {
 1033                         if (maxaddr < mm->gpa + mm->len)
 1034                                 maxaddr = mm->gpa + mm->len;
 1035                 }
 1036         }
 1037         return (maxaddr);
 1038 }
 1039 
 1040 static void
 1041 vm_iommu_modify(struct vm *vm, bool map)
 1042 {
 1043         int i, sz;
 1044         vm_paddr_t gpa, hpa;
 1045         struct mem_map *mm;
 1046         void *vp, *cookie, *host_domain;
 1047 
 1048         sz = PAGE_SIZE;
 1049         host_domain = iommu_host_domain();
 1050 
 1051         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 1052                 mm = &vm->mem_maps[i];
 1053                 if (!sysmem_mapping(vm, mm))
 1054                         continue;
 1055 
 1056                 if (map) {
 1057                         KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
 1058                             ("iommu map found invalid memmap %#lx/%#lx/%#x",
 1059                             mm->gpa, mm->len, mm->flags));
 1060                         if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
 1061                                 continue;
 1062                         mm->flags |= VM_MEMMAP_F_IOMMU;
 1063                 } else {
 1064                         if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
 1065                                 continue;
 1066                         mm->flags &= ~VM_MEMMAP_F_IOMMU;
 1067                         KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
 1068                             ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
 1069                             mm->gpa, mm->len, mm->flags));
 1070                 }
 1071 
 1072                 gpa = mm->gpa;
 1073                 while (gpa < mm->gpa + mm->len) {
 1074                         vp = vm_gpa_hold_global(vm, gpa, PAGE_SIZE,
 1075                             VM_PROT_WRITE, &cookie);
 1076                         KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
 1077                             vm_name(vm), gpa));
 1078 
 1079                         vm_gpa_release(cookie);
 1080 
 1081                         hpa = DMAP_TO_PHYS((uintptr_t)vp);
 1082                         if (map) {
 1083                                 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
 1084                         } else {
 1085                                 iommu_remove_mapping(vm->iommu, gpa, sz);
 1086                         }
 1087 
 1088                         gpa += PAGE_SIZE;
 1089                 }
 1090         }
 1091 
 1092         /*
 1093          * Invalidate the cached translations associated with the domain
 1094          * from which pages were removed.
 1095          */
 1096         if (map)
 1097                 iommu_invalidate_tlb(host_domain);
 1098         else
 1099                 iommu_invalidate_tlb(vm->iommu);
 1100 }
 1101 
 1102 #define vm_iommu_unmap(vm)      vm_iommu_modify((vm), false)
 1103 #define vm_iommu_map(vm)        vm_iommu_modify((vm), true)
 1104 
 1105 int
 1106 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
 1107 {
 1108         int error;
 1109 
 1110         error = ppt_unassign_device(vm, bus, slot, func);
 1111         if (error)
 1112                 return (error);
 1113 
 1114         if (ppt_assigned_devices(vm) == 0)
 1115                 vm_iommu_unmap(vm);
 1116 
 1117         return (0);
 1118 }
 1119 
 1120 int
 1121 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
 1122 {
 1123         int error;
 1124         vm_paddr_t maxaddr;
 1125 
 1126         /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
 1127         if (ppt_assigned_devices(vm) == 0) {
 1128                 KASSERT(vm->iommu == NULL,
 1129                     ("vm_assign_pptdev: iommu must be NULL"));
 1130                 maxaddr = vmm_sysmem_maxaddr(vm);
 1131                 vm->iommu = iommu_create_domain(maxaddr);
 1132                 if (vm->iommu == NULL)
 1133                         return (ENXIO);
 1134                 vm_iommu_map(vm);
 1135         }
 1136 
 1137         error = ppt_assign_device(vm, bus, slot, func);
 1138         return (error);
 1139 }
 1140 
 1141 static void *
 1142 _vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
 1143     void **cookie)
 1144 {
 1145         int i, count, pageoff;
 1146         struct mem_map *mm;
 1147         vm_page_t m;
 1148 
 1149         pageoff = gpa & PAGE_MASK;
 1150         if (len > PAGE_SIZE - pageoff)
 1151                 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
 1152 
 1153         count = 0;
 1154         for (i = 0; i < VM_MAX_MEMMAPS; i++) {
 1155                 mm = &vm->mem_maps[i];
 1156                 if (gpa >= mm->gpa && gpa < mm->gpa + mm->len) {
 1157                         count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
 1158                             trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
 1159                         break;
 1160                 }
 1161         }
 1162 
 1163         if (count == 1) {
 1164                 *cookie = m;
 1165                 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
 1166         } else {
 1167                 *cookie = NULL;
 1168                 return (NULL);
 1169         }
 1170 }
 1171 
 1172 void *
 1173 vm_gpa_hold(struct vcpu *vcpu, vm_paddr_t gpa, size_t len, int reqprot,
 1174     void **cookie)
 1175 {
 1176 #ifdef INVARIANTS
 1177         /*
 1178          * The current vcpu should be frozen to ensure 'vm_memmap[]'
 1179          * stability.
 1180          */
 1181         int state = vcpu_get_state(vcpu, NULL);
 1182         KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
 1183             __func__, state));
 1184 #endif
 1185         return (_vm_gpa_hold(vcpu->vm, gpa, len, reqprot, cookie));
 1186 }
 1187 
 1188 void *
 1189 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
 1190     void **cookie)
 1191 {
 1192         sx_assert(&vm->mem_segs_lock, SX_LOCKED);
 1193         return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
 1194 }
 1195 
 1196 void
 1197 vm_gpa_release(void *cookie)
 1198 {
 1199         vm_page_t m = cookie;
 1200 
 1201         vm_page_unwire(m, PQ_ACTIVE);
 1202 }
 1203 
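/*
 * Illustrative sketch, not part of vmm.c: vm_gpa_hold_global() above wires
 * and maps at most one guest page, so touching an arbitrary span means
 * looping page by page and releasing each cookie when done.  A hypothetical
 * helper that copies host data into guest memory; it assumes the caller
 * already holds the mem_segs lock, as the sx_assert() above requires.
 */
static int
example_copy_to_guest(struct vm *vm, vm_paddr_t gpa, const void *src,
    size_t len)
{
	const uint8_t *p = src;
	void *cookie, *hva;
	size_t n;

	while (len > 0) {
		/* Never let a single hold cross a page boundary. */
		n = MIN(len, PAGE_SIZE - (gpa & PAGE_MASK));
		hva = vm_gpa_hold_global(vm, gpa, n, VM_PROT_WRITE, &cookie);
		if (hva == NULL)
			return (EFAULT);
		memcpy(hva, p, n);
		vm_gpa_release(cookie);		/* unwires the page */
		gpa += n;
		p += n;
		len -= n;
	}
	return (0);
}
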
 1204 int
 1205 vm_get_register(struct vcpu *vcpu, int reg, uint64_t *retval)
 1206 {
 1207 
 1208         if (reg >= VM_REG_LAST)
 1209                 return (EINVAL);
 1210 
 1211         return (vmmops_getreg(vcpu->cookie, reg, retval));
 1212 }
 1213 
 1214 int
 1215 vm_set_register(struct vcpu *vcpu, int reg, uint64_t val)
 1216 {
 1217         int error;
 1218 
 1219         if (reg >= VM_REG_LAST)
 1220                 return (EINVAL);
 1221 
 1222         error = vmmops_setreg(vcpu->cookie, reg, val);
 1223         if (error || reg != VM_REG_GUEST_RIP)
 1224                 return (error);
 1225 
 1226         /* Set 'nextrip' to match the value of %rip */
 1227         VMM_CTR1(vcpu, "Setting nextrip to %#lx", val);
 1228         vcpu->nextrip = val;
 1229         return (0);
 1230 }
 1231 
 1232 static bool
 1233 is_descriptor_table(int reg)
 1234 {
 1235 
 1236         switch (reg) {
 1237         case VM_REG_GUEST_IDTR:
 1238         case VM_REG_GUEST_GDTR:
 1239                 return (true);
 1240         default:
 1241                 return (false);
 1242         }
 1243 }
 1244 
 1245 static bool
 1246 is_segment_register(int reg)
 1247 {
 1248 
 1249         switch (reg) {
 1250         case VM_REG_GUEST_ES:
 1251         case VM_REG_GUEST_CS:
 1252         case VM_REG_GUEST_SS:
 1253         case VM_REG_GUEST_DS:
 1254         case VM_REG_GUEST_FS:
 1255         case VM_REG_GUEST_GS:
 1256         case VM_REG_GUEST_TR:
 1257         case VM_REG_GUEST_LDTR:
 1258                 return (true);
 1259         default:
 1260                 return (false);
 1261         }
 1262 }
 1263 
 1264 int
 1265 vm_get_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc)
 1266 {
 1267 
 1268         if (!is_segment_register(reg) && !is_descriptor_table(reg))
 1269                 return (EINVAL);
 1270 
 1271         return (vmmops_getdesc(vcpu->cookie, reg, desc));
 1272 }
 1273 
 1274 int
 1275 vm_set_seg_desc(struct vcpu *vcpu, int reg, struct seg_desc *desc)
 1276 {
 1277 
 1278         if (!is_segment_register(reg) && !is_descriptor_table(reg))
 1279                 return (EINVAL);
 1280 
 1281         return (vmmops_setdesc(vcpu->cookie, reg, desc));
 1282 }
 1283 
 1284 static void
 1285 restore_guest_fpustate(struct vcpu *vcpu)
 1286 {
 1287 
 1288         /* flush host state to the pcb */
 1289         fpuexit(curthread);
 1290 
 1291         /* restore guest FPU state */
 1292         fpu_stop_emulating();
 1293         fpurestore(vcpu->guestfpu);
 1294 
 1295         /* restore guest XCR0 if XSAVE is enabled in the host */
 1296         if (rcr4() & CR4_XSAVE)
 1297                 load_xcr(0, vcpu->guest_xcr0);
 1298 
 1299         /*
 1300          * The FPU is now "dirty" with the guest's state so turn on emulation
 1301          * to trap any access to the FPU by the host.
 1302          */
 1303         fpu_start_emulating();
 1304 }
 1305 
 1306 static void
 1307 save_guest_fpustate(struct vcpu *vcpu)
 1308 {
 1309 
 1310         if ((rcr0() & CR0_TS) == 0)
 1311                 panic("fpu emulation not enabled in host!");
 1312 
 1313         /* save guest XCR0 and restore host XCR0 */
 1314         if (rcr4() & CR4_XSAVE) {
 1315                 vcpu->guest_xcr0 = rxcr(0);
 1316                 load_xcr(0, vmm_get_host_xcr0());
 1317         }
 1318 
 1319         /* save guest FPU state */
 1320         fpu_stop_emulating();
 1321         fpusave(vcpu->guestfpu);
 1322         fpu_start_emulating();
 1323 }
 1324 
 1325 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
 1326 
 1327 static int
 1328 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
 1329     bool from_idle)
 1330 {
 1331         int error;
 1332 
 1333         vcpu_assert_locked(vcpu);
 1334 
 1335         /*
 1336          * State transitions from the vmmdev_ioctl() must always begin from
 1337          * the VCPU_IDLE state. This guarantees that there is only a single
 1338          * ioctl() operating on a vcpu at any point.
 1339          */
 1340         if (from_idle) {
 1341                 while (vcpu->state != VCPU_IDLE) {
 1342                         vcpu->reqidle = 1;
 1343                         vcpu_notify_event_locked(vcpu, false);
 1344                         VMM_CTR1(vcpu, "vcpu state change from %s to "
 1345                             "idle requested", vcpu_state2str(vcpu->state));
 1346                         msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
 1347                 }
 1348         } else {
 1349                 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
 1350                     "vcpu idle state"));
 1351         }
 1352 
 1353         if (vcpu->state == VCPU_RUNNING) {
 1354                 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
 1355                     "mismatch for running vcpu", curcpu, vcpu->hostcpu));
 1356         } else {
 1357                 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
 1358                     "vcpu that is not running", vcpu->hostcpu));
 1359         }
 1360 
 1361         /*
 1362          * The following state transitions are allowed:
 1363          * IDLE -> FROZEN -> IDLE
 1364          * FROZEN -> RUNNING -> FROZEN
 1365          * FROZEN -> SLEEPING -> FROZEN
 1366          */
 1367         switch (vcpu->state) {
 1368         case VCPU_IDLE:
 1369         case VCPU_RUNNING:
 1370         case VCPU_SLEEPING:
 1371                 error = (newstate != VCPU_FROZEN);
 1372                 break;
 1373         case VCPU_FROZEN:
 1374                 error = (newstate == VCPU_FROZEN);
 1375                 break;
 1376         default:
 1377                 error = 1;
 1378                 break;
 1379         }
 1380 
 1381         if (error)
 1382                 return (EBUSY);
 1383 
 1384         VMM_CTR2(vcpu, "vcpu state changed from %s to %s",
 1385             vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
 1386 
 1387         vcpu->state = newstate;
 1388         if (newstate == VCPU_RUNNING)
 1389                 vcpu->hostcpu = curcpu;
 1390         else
 1391                 vcpu->hostcpu = NOCPU;
 1392 
 1393         if (newstate == VCPU_IDLE)
 1394                 wakeup(&vcpu->state);
 1395 
 1396         return (0);
 1397 }
 1398 
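/*
 * Illustrative sketch, not part of vmm.c: the transition rules enforced above
 * mean an ioctl must first park the target vcpu in VCPU_FROZEN (sleeping, if
 * needed, until it drops out of RUNNING or SLEEPING), do its work, and then
 * hand it back to VCPU_IDLE so other ioctls may proceed.  'example_get_rip'
 * is a hypothetical caller built on vcpu_set_state(), the locking wrapper
 * around the function above (defined later in this file):
 */
static int
example_get_rip(struct vcpu *vcpu, uint64_t *rip)
{
	int error;

	/* IDLE -> FROZEN; from_idle=true waits for the vcpu to become idle. */
	error = vcpu_set_state(vcpu, VCPU_FROZEN, true);
	if (error != 0)
		return (error);

	error = vm_get_register(vcpu, VM_REG_GUEST_RIP, rip);

	/* FROZEN -> IDLE; wakes any thread waiting in the loop above. */
	(void)vcpu_set_state(vcpu, VCPU_IDLE, false);
	return (error);
}
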
 1399 static void
 1400 vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
 1401 {
 1402         int error;
 1403 
 1404         if ((error = vcpu_set_state(vcpu, newstate, false)) != 0)
 1405                 panic("Error %d setting state to %d\n", error, newstate);
 1406 }
 1407 
 1408 static void
 1409 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
 1410 {
 1411         int error;
 1412 
 1413         if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
 1414                 panic("Error %d setting state to %d", error, newstate);
 1415 }
 1416 
 1417 static int
 1418 vm_handle_rendezvous(struct vcpu *vcpu)
 1419 {
 1420         struct vm *vm = vcpu->vm;
 1421         struct thread *td;
 1422         int error, vcpuid;
 1423 
 1424         error = 0;
 1425         vcpuid = vcpu->vcpuid;
 1426         td = curthread;
 1427         mtx_lock(&vm->rendezvous_mtx);
 1428         while (vm->rendezvous_func != NULL) {
 1429                 /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
 1430                 CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus);
 1431 
 1432                 if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
 1433                     !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
 1434                         VMM_CTR0(vcpu, "Calling rendezvous func");
 1435                         (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg);
 1436                         CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
 1437                 }
 1438                 if (CPU_CMP(&vm->rendezvous_req_cpus,
 1439                     &vm->rendezvous_done_cpus) == 0) {
 1440                         VMM_CTR0(vcpu, "Rendezvous completed");
 1441                         vm->rendezvous_func = NULL;
 1442                         wakeup(&vm->rendezvous_func);
 1443                         break;
 1444                 }
 1445                 VMM_CTR0(vcpu, "Wait for rendezvous completion");
 1446                 mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
 1447                     "vmrndv", hz);
 1448                 if (td_ast_pending(td, TDA_SUSPEND)) {
 1449                         mtx_unlock(&vm->rendezvous_mtx);
 1450                         error = thread_check_susp(td, true);
 1451                         if (error != 0)
 1452                                 return (error);
 1453                         mtx_lock(&vm->rendezvous_mtx);
 1454                 }
 1455         }
 1456         mtx_unlock(&vm->rendezvous_mtx);
 1457         return (0);
 1458 }
 1459 
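/*
 * Illustrative sketch, not part of vmm.c: a rendezvous callback receives the
 * vcpu it runs on plus the opaque 'rendezvous_arg'.  Every vcpu named in
 * 'rendezvous_req_cpus' invokes it exactly once from the loop above, with
 * 'rendezvous_mtx' held, so invocations are serialized; the initiator simply
 * waits until 'rendezvous_done_cpus' matches the requested set.  A
 * hypothetical callback that counts participants:
 */
static void
example_rendezvous_cb(struct vcpu *vcpu, void *arg)
{
	int *visited = arg;

	/* Serialized by rendezvous_mtx, so a plain increment is enough. */
	(*visited)++;
	VMM_CTR1(vcpu, "example rendezvous visit %d", *visited);
}
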
 1460 /*
 1461  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
 1462  */
 1463 static int
 1464 vm_handle_hlt(struct vcpu *vcpu, bool intr_disabled, bool *retu)
 1465 {
 1466         struct vm *vm = vcpu->vm;
 1467         const char *wmesg;
 1468         struct thread *td;
 1469         int error, t, vcpuid, vcpu_halted, vm_halted;
 1470 
 1471         vcpuid = vcpu->vcpuid;
 1472         vcpu_halted = 0;
 1473         vm_halted = 0;
 1474         error = 0;
 1475         td = curthread;
 1476 
 1477         KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
 1478 
 1479         vcpu_lock(vcpu);
 1480         while (1) {
 1481                 /*
 1482                  * Do a final check for pending NMI or interrupts before
 1483                  * really putting this thread to sleep. Also check for
 1484                  * software events that would cause this vcpu to wakeup.
 1485                  *
 1486                  * These interrupts/events could have happened after the
 1487                  * vcpu returned from vmmops_run() and before it acquired the
 1488                  * vcpu lock above.
 1489                  */
 1490                 if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
 1491                         break;
 1492                 if (vm_nmi_pending(vcpu))
 1493                         break;
 1494                 if (!intr_disabled) {
 1495                         if (vm_extint_pending(vcpu) ||
 1496                             vlapic_pending_intr(vcpu->vlapic, NULL)) {
 1497                                 break;
 1498                         }
 1499                 }
 1500 
 1501                 /* Don't go to sleep if the vcpu thread needs to yield */
 1502                 if (vcpu_should_yield(vcpu))
 1503                         break;
 1504 
 1505                 if (vcpu_debugged(vcpu))
 1506                         break;
 1507 
 1508                 /*
 1509                  * Some Linux guests implement "halt" by having all vcpus
 1510                  * execute HLT with interrupts disabled. 'halted_cpus' keeps
 1511                  * track of the vcpus that have entered this state. When all
 1512                  * vcpus enter the halted state the virtual machine is halted.
 1513                  */
 1514                 if (intr_disabled) {
 1515                         wmesg = "vmhalt";
 1516                         VMM_CTR0(vcpu, "Halted");
 1517                         if (!vcpu_halted && halt_detection_enabled) {
 1518                                 vcpu_halted = 1;
 1519                                 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
 1520                         }
 1521                         if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
 1522                                 vm_halted = 1;
 1523                                 break;
 1524                         }
 1525                 } else {
 1526                         wmesg = "vmidle";
 1527                 }
 1528 
 1529                 t = ticks;
 1530                 vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
 1531                 /*
 1532                  * XXX msleep_spin() cannot be interrupted by signals so
 1533                  * wake up periodically to check pending signals.
 1534                  */
 1535                 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
 1536                 vcpu_require_state_locked(vcpu, VCPU_FROZEN);
 1537                 vmm_stat_incr(vcpu, VCPU_IDLE_TICKS, ticks - t);
 1538                 if (td_ast_pending(td, TDA_SUSPEND)) {
 1539                         vcpu_unlock(vcpu);
 1540                         error = thread_check_susp(td, false);
 1541                         if (error != 0) {
 1542                                 if (vcpu_halted) {
 1543                                         CPU_CLR_ATOMIC(vcpuid,
 1544                                             &vm->halted_cpus);
 1545                                 }
 1546                                 return (error);
 1547                         }
 1548                         vcpu_lock(vcpu);
 1549                 }
 1550         }
 1551 
 1552         if (vcpu_halted)
 1553                 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
 1554 
 1555         vcpu_unlock(vcpu);
 1556 
 1557         if (vm_halted)
 1558                 vm_suspend(vm, VM_SUSPEND_HALT);
 1559 
 1560         return (0);
 1561 }
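
      /*
       * Hedged sketch (not compiled): a restatement of the halt-detection
       * condition used by vm_handle_hlt() above. The helper name is
       * illustrative and not part of this file.
       */
      #if 0
      static bool
      vm_all_vcpus_halted(struct vm *vm)
      {
              /* The VM as a whole halts once every active vcpu has halted. */
              return (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0);
      }
      #endif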
 1562 
 1563 static int
 1564 vm_handle_paging(struct vcpu *vcpu, bool *retu)
 1565 {
 1566         struct vm *vm = vcpu->vm;
 1567         int rv, ftype;
 1568         struct vm_map *map;
 1569         struct vm_exit *vme;
 1570 
 1571         vme = &vcpu->exitinfo;
 1572 
 1573         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
 1574             __func__, vme->inst_length));
 1575 
 1576         ftype = vme->u.paging.fault_type;
 1577         KASSERT(ftype == VM_PROT_READ ||
 1578             ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
 1579             ("vm_handle_paging: invalid fault_type %d", ftype));
 1580 
 1581         if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
 1582                 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
 1583                     vme->u.paging.gpa, ftype);
 1584                 if (rv == 0) {
 1585                         VMM_CTR2(vcpu, "%s bit emulation for gpa %#lx",
 1586                             ftype == VM_PROT_READ ? "accessed" : "dirty",
 1587                             vme->u.paging.gpa);
 1588                         goto done;
 1589                 }
 1590         }
 1591 
 1592         map = &vm->vmspace->vm_map;
 1593         rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL, NULL);
 1594 
 1595         VMM_CTR3(vcpu, "vm_handle_paging rv = %d, gpa = %#lx, "
 1596             "ftype = %d", rv, vme->u.paging.gpa, ftype);
 1597 
 1598         if (rv != KERN_SUCCESS)
 1599                 return (EFAULT);
 1600 done:
 1601         return (0);
 1602 }
 1603 
 1604 static int
 1605 vm_handle_inst_emul(struct vcpu *vcpu, bool *retu)
 1606 {
 1607         struct vie *vie;
 1608         struct vm_exit *vme;
 1609         uint64_t gla, gpa, cs_base;
 1610         struct vm_guest_paging *paging;
 1611         mem_region_read_t mread;
 1612         mem_region_write_t mwrite;
 1613         enum vm_cpu_mode cpu_mode;
 1614         int cs_d, error, fault;
 1615 
 1616         vme = &vcpu->exitinfo;
 1617 
 1618         KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
 1619             __func__, vme->inst_length));
 1620 
 1621         gla = vme->u.inst_emul.gla;
 1622         gpa = vme->u.inst_emul.gpa;
 1623         cs_base = vme->u.inst_emul.cs_base;
 1624         cs_d = vme->u.inst_emul.cs_d;
 1625         vie = &vme->u.inst_emul.vie;
 1626         paging = &vme->u.inst_emul.paging;
 1627         cpu_mode = paging->cpu_mode;
 1628 
 1629         VMM_CTR1(vcpu, "inst_emul fault accessing gpa %#lx", gpa);
 1630 
 1631         /* Fetch, decode and emulate the faulting instruction */
 1632         if (vie->num_valid == 0) {
 1633                 error = vmm_fetch_instruction(vcpu, paging, vme->rip + cs_base,
 1634                     VIE_INST_SIZE, vie, &fault);
 1635         } else {
 1636                 /*
 1637                  * The instruction bytes have already been copied into 'vie'
 1638                  */
 1639                 error = fault = 0;
 1640         }
 1641         if (error || fault)
 1642                 return (error);
 1643 
 1644         if (vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, vie) != 0) {
 1645                 VMM_CTR1(vcpu, "Error decoding instruction at %#lx",
 1646                     vme->rip + cs_base);
 1647                 *retu = true;       /* dump instruction bytes in userspace */
 1648                 return (0);
 1649         }
 1650 
 1651         /*
 1652          * Update 'nextrip' based on the length of the emulated instruction.
 1653          */
 1654         vme->inst_length = vie->num_processed;
 1655         vcpu->nextrip += vie->num_processed;
 1656         VMM_CTR1(vcpu, "nextrip updated to %#lx after instruction decoding",
 1657             vcpu->nextrip);
 1658 
 1659         /* return to userland unless this is an in-kernel emulated device */
 1660         if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
 1661                 mread = lapic_mmio_read;
 1662                 mwrite = lapic_mmio_write;
 1663         } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
 1664                 mread = vioapic_mmio_read;
 1665                 mwrite = vioapic_mmio_write;
 1666         } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
 1667                 mread = vhpet_mmio_read;
 1668                 mwrite = vhpet_mmio_write;
 1669         } else {
 1670                 *retu = true;
 1671                 return (0);
 1672         }
 1673 
 1674         error = vmm_emulate_instruction(vcpu, gpa, vie, paging, mread, mwrite,
 1675             retu);
 1676 
 1677         return (error);
 1678 }
 1679 
 1680 static int
 1681 vm_handle_suspend(struct vcpu *vcpu, bool *retu)
 1682 {
 1683         struct vm *vm = vcpu->vm;
 1684         int error, i;
 1685         struct thread *td;
 1686 
 1687         error = 0;
 1688         td = curthread;
 1689 
 1690         CPU_SET_ATOMIC(vcpu->vcpuid, &vm->suspended_cpus);
 1691 
 1692         /*
 1693          * Wait until all 'active_cpus' have suspended themselves.
 1694          *
 1695          * Since a VM may be suspended at any time, including when one or
 1696          * more vcpus are doing a rendezvous, we need to call the rendezvous
 1697          * handler while we are waiting to prevent a deadlock.
 1698          */
 1699         vcpu_lock(vcpu);
 1700         while (error == 0) {
 1701                 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
 1702                         VMM_CTR0(vcpu, "All vcpus suspended");
 1703                         break;
 1704                 }
 1705 
 1706                 if (vm->rendezvous_func == NULL) {
 1707                         VMM_CTR0(vcpu, "Sleeping during suspend");
 1708                         vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
 1709                         msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
 1710                         vcpu_require_state_locked(vcpu, VCPU_FROZEN);
 1711                         if (td_ast_pending(td, TDA_SUSPEND)) {
 1712                                 vcpu_unlock(vcpu);
 1713                                 error = thread_check_susp(td, false);
 1714                                 vcpu_lock(vcpu);
 1715                         }
 1716                 } else {
 1717                         VMM_CTR0(vcpu, "Rendezvous during suspend");
 1718                         vcpu_unlock(vcpu);
 1719                         error = vm_handle_rendezvous(vcpu);
 1720                         vcpu_lock(vcpu);
 1721                 }
 1722         }
 1723         vcpu_unlock(vcpu);
 1724 
 1725         /*
 1726          * Wakeup the other sleeping vcpus and return to userspace.
 1727          */
 1728         for (i = 0; i < vm->maxcpus; i++) {
 1729                 if (CPU_ISSET(i, &vm->suspended_cpus)) {
 1730                         vcpu_notify_event(vm_vcpu(vm, i), false);
 1731                 }
 1732         }
 1733 
 1734         *retu = true;
 1735         return (error);
 1736 }
 1737 
 1738 static int
 1739 vm_handle_reqidle(struct vcpu *vcpu, bool *retu)
 1740 {
 1741         vcpu_lock(vcpu);
 1742         KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
 1743         vcpu->reqidle = 0;
 1744         vcpu_unlock(vcpu);
 1745         *retu = true;
 1746         return (0);
 1747 }
 1748 
 1749 int
 1750 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 1751 {
 1752         int i;
 1753 
 1754         if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
 1755                 return (EINVAL);
 1756 
 1757         if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
 1758                 VM_CTR2(vm, "virtual machine already suspended %d/%d",
 1759                     vm->suspend, how);
 1760                 return (EALREADY);
 1761         }
 1762 
 1763         VM_CTR1(vm, "virtual machine successfully suspended %d", how);
 1764 
 1765         /*
 1766          * Notify all active vcpus that they are now suspended.
 1767          */
 1768         for (i = 0; i < vm->maxcpus; i++) {
 1769                 if (CPU_ISSET(i, &vm->active_cpus))
 1770                         vcpu_notify_event(vm_vcpu(vm, i), false);
 1771         }
 1772 
 1773         return (0);
 1774 }
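
      /*
       * Hedged usage sketch (not compiled): requesting a suspend. Only the
       * first request wins; later callers get EALREADY and the original
       * reason is kept. 'vm' and 'error' are assumed from the caller.
       */
      #if 0
              error = vm_suspend(vm, VM_SUSPEND_RESET);
              if (error == 0) {
                      /*
                       * Each active vcpu is notified and its next vm_run()
                       * exits with VM_EXITCODE_SUSPENDED and
                       * u.suspended.how == VM_SUSPEND_RESET.
                       */
              }
      #endif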
 1775 
 1776 void
 1777 vm_exit_suspended(struct vcpu *vcpu, uint64_t rip)
 1778 {
 1779         struct vm *vm = vcpu->vm;
 1780         struct vm_exit *vmexit;
 1781 
 1782         KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
 1783             ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
 1784 
 1785         vmexit = vm_exitinfo(vcpu);
 1786         vmexit->rip = rip;
 1787         vmexit->inst_length = 0;
 1788         vmexit->exitcode = VM_EXITCODE_SUSPENDED;
 1789         vmexit->u.suspended.how = vm->suspend;
 1790 }
 1791 
 1792 void
 1793 vm_exit_debug(struct vcpu *vcpu, uint64_t rip)
 1794 {
 1795         struct vm_exit *vmexit;
 1796 
 1797         vmexit = vm_exitinfo(vcpu);
 1798         vmexit->rip = rip;
 1799         vmexit->inst_length = 0;
 1800         vmexit->exitcode = VM_EXITCODE_DEBUG;
 1801 }
 1802 
 1803 void
 1804 vm_exit_rendezvous(struct vcpu *vcpu, uint64_t rip)
 1805 {
 1806         struct vm_exit *vmexit;
 1807 
 1808         vmexit = vm_exitinfo(vcpu);
 1809         vmexit->rip = rip;
 1810         vmexit->inst_length = 0;
 1811         vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
 1812         vmm_stat_incr(vcpu, VMEXIT_RENDEZVOUS, 1);
 1813 }
 1814 
 1815 void
 1816 vm_exit_reqidle(struct vcpu *vcpu, uint64_t rip)
 1817 {
 1818         struct vm_exit *vmexit;
 1819 
 1820         vmexit = vm_exitinfo(vcpu);
 1821         vmexit->rip = rip;
 1822         vmexit->inst_length = 0;
 1823         vmexit->exitcode = VM_EXITCODE_REQIDLE;
 1824         vmm_stat_incr(vcpu, VMEXIT_REQIDLE, 1);
 1825 }
 1826 
 1827 void
 1828 vm_exit_astpending(struct vcpu *vcpu, uint64_t rip)
 1829 {
 1830         struct vm_exit *vmexit;
 1831 
 1832         vmexit = vm_exitinfo(vcpu);
 1833         vmexit->rip = rip;
 1834         vmexit->inst_length = 0;
 1835         vmexit->exitcode = VM_EXITCODE_BOGUS;
 1836         vmm_stat_incr(vcpu, VMEXIT_ASTPENDING, 1);
 1837 }
 1838 
 1839 int
 1840 vm_run(struct vcpu *vcpu, struct vm_exit *vme_user)
 1841 {
 1842         struct vm *vm = vcpu->vm;
 1843         struct vm_eventinfo evinfo;
 1844         int error, vcpuid;
 1845         struct pcb *pcb;
 1846         uint64_t tscval;
 1847         struct vm_exit *vme;
 1848         bool retu, intr_disabled;
 1849         pmap_t pmap;
 1850 
 1851         vcpuid = vcpu->vcpuid;
 1852 
 1853         if (!CPU_ISSET(vcpuid, &vm->active_cpus))
 1854                 return (EINVAL);
 1855 
 1856         if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
 1857                 return (EINVAL);
 1858 
 1859         pmap = vmspace_pmap(vm->vmspace);
 1860         vme = &vcpu->exitinfo;
 1861         evinfo.rptr = &vm->rendezvous_func;
 1862         evinfo.sptr = &vm->suspend;
 1863         evinfo.iptr = &vcpu->reqidle;
 1864 restart:
 1865         critical_enter();
 1866 
 1867         KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 1868             ("vm_run: absurd pm_active"));
 1869 
 1870         tscval = rdtsc();
 1871 
 1872         pcb = PCPU_GET(curpcb);
 1873         set_pcb_flags(pcb, PCB_FULL_IRET);
 1874 
 1875         restore_guest_fpustate(vcpu);
 1876 
 1877         vcpu_require_state(vcpu, VCPU_RUNNING);
 1878         error = vmmops_run(vcpu->cookie, vcpu->nextrip, pmap, &evinfo);
 1879         vcpu_require_state(vcpu, VCPU_FROZEN);
 1880 
 1881         save_guest_fpustate(vcpu);
 1882 
 1883         vmm_stat_incr(vcpu, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
 1884 
 1885         critical_exit();
 1886 
 1887         if (error == 0) {
 1888                 retu = false;
 1889                 vcpu->nextrip = vme->rip + vme->inst_length;
 1890                 switch (vme->exitcode) {
 1891                 case VM_EXITCODE_REQIDLE:
 1892                         error = vm_handle_reqidle(vcpu, &retu);
 1893                         break;
 1894                 case VM_EXITCODE_SUSPENDED:
 1895                         error = vm_handle_suspend(vcpu, &retu);
 1896                         break;
 1897                 case VM_EXITCODE_IOAPIC_EOI:
 1898                         vioapic_process_eoi(vm, vme->u.ioapic_eoi.vector);
 1899                         break;
 1900                 case VM_EXITCODE_RENDEZVOUS:
 1901                         error = vm_handle_rendezvous(vcpu);
 1902                         break;
 1903                 case VM_EXITCODE_HLT:
 1904                         intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
 1905                         error = vm_handle_hlt(vcpu, intr_disabled, &retu);
 1906                         break;
 1907                 case VM_EXITCODE_PAGING:
 1908                         error = vm_handle_paging(vcpu, &retu);
 1909                         break;
 1910                 case VM_EXITCODE_INST_EMUL:
 1911                         error = vm_handle_inst_emul(vcpu, &retu);
 1912                         break;
 1913                 case VM_EXITCODE_INOUT:
 1914                 case VM_EXITCODE_INOUT_STR:
 1915                         error = vm_handle_inout(vcpu, vme, &retu);
 1916                         break;
 1917                 case VM_EXITCODE_MONITOR:
 1918                 case VM_EXITCODE_MWAIT:
 1919                 case VM_EXITCODE_VMINSN:
 1920                         vm_inject_ud(vcpu);
 1921                         break;
 1922                 default:
 1923                         retu = true;    /* handled in userland */
 1924                         break;
 1925                 }
 1926         }
 1927 
 1928         /*
 1929          * VM_EXITCODE_INST_EMUL could access the APIC, which could transform
 1930          * the exit code into VM_EXITCODE_IPI.
 1931          */
 1932         if (error == 0 && vme->exitcode == VM_EXITCODE_IPI) {
 1933                 retu = false;
 1934                 error = vm_handle_ipi(vcpu, vme, &retu);
 1935         }
 1936 
 1937         if (error == 0 && retu == false)
 1938                 goto restart;
 1939 
 1940         vmm_stat_incr(vcpu, VMEXIT_USERSPACE, 1);
 1941         VMM_CTR2(vcpu, "retu %d/%d", error, vme->exitcode);
 1942 
 1943         /* copy the exit information */
 1944         *vme_user = *vme;
 1945         return (error);
 1946 }
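
      /*
       * Hedged sketch (not compiled) of the caller's side of vm_run(); in
       * practice this is the vmm device ioctl path. Exits handled in the
       * kernel loop back via 'restart' inside vm_run() itself, so a zero
       * return means 'vme' holds an exit that userspace must complete.
       * 'vcpu' is assumed from the caller.
       */
      #if 0
              struct vm_exit vme;
              int error;

              error = vm_run(vcpu, &vme);
              if (error == 0) {
                      switch (vme.exitcode) {
                      case VM_EXITCODE_SUSPENDED:
                              /* e.g. tear down or reset the VM */
                              break;
                      default:
                              /* e.g. VM_EXITCODE_INOUT, VM_EXITCODE_INST_EMUL, ... */
                              break;
                      }
              }
      #endif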
 1947 
 1948 int
 1949 vm_restart_instruction(struct vcpu *vcpu)
 1950 {
 1951         enum vcpu_state state;
 1952         uint64_t rip;
 1953         int error __diagused;
 1954 
 1955         state = vcpu_get_state(vcpu, NULL);
 1956         if (state == VCPU_RUNNING) {
 1957                 /*
 1958                  * When a vcpu is "running" the next instruction is determined
 1959                  * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
 1960                  * Thus setting 'inst_length' to zero will cause the current
 1961                  * instruction to be restarted.
 1962                  */
 1963                 vcpu->exitinfo.inst_length = 0;
 1964                 VMM_CTR1(vcpu, "restarting instruction at %#lx by "
 1965                     "setting inst_length to zero", vcpu->exitinfo.rip);
 1966         } else if (state == VCPU_FROZEN) {
 1967                 /*
 1968                  * When a vcpu is "frozen" it is outside the critical section
 1969                  * around vmmops_run() and 'nextrip' points to the next
 1970                  * instruction. Thus instruction restart is achieved by setting
 1971                  * 'nextrip' to the vcpu's %rip.
 1972                  */
 1973                 error = vm_get_register(vcpu, VM_REG_GUEST_RIP, &rip);
 1974                 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
 1975                 VMM_CTR2(vcpu, "restarting instruction by updating "
 1976                     "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
 1977                 vcpu->nextrip = rip;
 1978         } else {
 1979                 panic("%s: invalid state %d", __func__, state);
 1980         }
 1981         return (0);
 1982 }
 1983 
 1984 int
 1985 vm_exit_intinfo(struct vcpu *vcpu, uint64_t info)
 1986 {
 1987         int type, vector;
 1988 
 1989         if (info & VM_INTINFO_VALID) {
 1990                 type = info & VM_INTINFO_TYPE;
 1991                 vector = info & 0xff;
 1992                 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
 1993                         return (EINVAL);
 1994                 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
 1995                         return (EINVAL);
 1996                 if (info & VM_INTINFO_RSVD)
 1997                         return (EINVAL);
 1998         } else {
 1999                 info = 0;
 2000         }
 2001         VMM_CTR2(vcpu, "%s: info1(%#lx)", __func__, info);
 2002         vcpu->exitintinfo = info;
 2003         return (0);
 2004 }
 2005 
 2006 enum exc_class {
 2007         EXC_BENIGN,
 2008         EXC_CONTRIBUTORY,
 2009         EXC_PAGEFAULT
 2010 };
 2011 
 2012 #define IDT_VE  20      /* Virtualization Exception (Intel specific) */
 2013 
 2014 static enum exc_class
 2015 exception_class(uint64_t info)
 2016 {
 2017         int type, vector;
 2018 
 2019         KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
 2020         type = info & VM_INTINFO_TYPE;
 2021         vector = info & 0xff;
 2022 
 2023         /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
 2024         switch (type) {
 2025         case VM_INTINFO_HWINTR:
 2026         case VM_INTINFO_SWINTR:
 2027         case VM_INTINFO_NMI:
 2028                 return (EXC_BENIGN);
 2029         default:
 2030                 /*
 2031                  * Hardware exception.
 2032                  *
 2033                  * SVM and VT-x use identical type values to represent NMI,
 2034                  * hardware interrupt and software interrupt.
 2035                  *
 2036                  * SVM uses type '3' for all exceptions. VT-x uses type '3'
 2037                  * for exceptions except #BP and #OF. #BP and #OF use a type
 2038                  * value of '5' or '6'. Therefore we don't check for explicit
 2039                  * values of 'type' to classify 'intinfo' into a hardware
 2040                  * exception.
 2041                  */
 2042                 break;
 2043         }
 2044 
 2045         switch (vector) {
 2046         case IDT_PF:
 2047         case IDT_VE:
 2048                 return (EXC_PAGEFAULT);
 2049         case IDT_DE:
 2050         case IDT_TS:
 2051         case IDT_NP:
 2052         case IDT_SS:
 2053         case IDT_GP:
 2054                 return (EXC_CONTRIBUTORY);
 2055         default:
 2056                 return (EXC_BENIGN);
 2057         }
 2058 }
 2059 
 2060 static int
 2061 nested_fault(struct vcpu *vcpu, uint64_t info1, uint64_t info2,
 2062     uint64_t *retinfo)
 2063 {
 2064         enum exc_class exc1, exc2;
 2065         int type1, vector1;
 2066 
 2067         KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
 2068         KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
 2069 
 2070         /*
 2071          * If an exception occurs while attempting to call the double-fault
 2072          * handler the processor enters shutdown mode (aka triple fault).
 2073          */
 2074         type1 = info1 & VM_INTINFO_TYPE;
 2075         vector1 = info1 & 0xff;
 2076         if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
 2077                 VMM_CTR2(vcpu, "triple fault: info1(%#lx), info2(%#lx)",
 2078                     info1, info2);
 2079                 vm_suspend(vcpu->vm, VM_SUSPEND_TRIPLEFAULT);
 2080                 *retinfo = 0;
 2081                 return (0);
 2082         }
 2083 
 2084         /*
 2085          * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
 2086          */
 2087         exc1 = exception_class(info1);
 2088         exc2 = exception_class(info2);
 2089         if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
 2090             (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
 2091                 /* Convert nested fault into a double fault. */
 2092                 *retinfo = IDT_DF;
 2093                 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 2094                 *retinfo |= VM_INTINFO_DEL_ERRCODE;
 2095         } else {
 2096                 /* Handle exceptions serially */
 2097                 *retinfo = info2;
 2098         }
 2099         return (1);
 2100 }
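
      /*
       * Worked example (illustrative only) of the classification above;
       * 'vcpu' is assumed from the caller.
       */
      #if 0
              uint64_t info1, info2, retinfo;

              info1 = VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION | IDT_PF;
              info2 = VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION | IDT_GP;
              /*
               * exc1 is EXC_PAGEFAULT and exc2 is EXC_CONTRIBUTORY, so the
               * pair folds into a double fault:
               *
               *   retinfo == VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION |
               *       VM_INTINFO_DEL_ERRCODE | IDT_DF
               *
               * Reversing the order (a #PF raised while delivering a #GP) is
               * handled serially and retinfo is simply info2.
               */
              (void)nested_fault(vcpu, info1, info2, &retinfo);
      #endif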
 2101 
 2102 static uint64_t
 2103 vcpu_exception_intinfo(struct vcpu *vcpu)
 2104 {
 2105         uint64_t info = 0;
 2106 
 2107         if (vcpu->exception_pending) {
 2108                 info = vcpu->exc_vector & 0xff;
 2109                 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
 2110                 if (vcpu->exc_errcode_valid) {
 2111                         info |= VM_INTINFO_DEL_ERRCODE;
 2112                         info |= (uint64_t)vcpu->exc_errcode << 32;
 2113                 }
 2114         }
 2115         return (info);
 2116 }
 2117 
 2118 int
 2119 vm_entry_intinfo(struct vcpu *vcpu, uint64_t *retinfo)
 2120 {
 2121         uint64_t info1, info2;
 2122         int valid;
 2123 
 2124         info1 = vcpu->exitintinfo;
 2125         vcpu->exitintinfo = 0;
 2126 
 2127         info2 = 0;
 2128         if (vcpu->exception_pending) {
 2129                 info2 = vcpu_exception_intinfo(vcpu);
 2130                 vcpu->exception_pending = 0;
 2131                 VMM_CTR2(vcpu, "Exception %d delivered: %#lx",
 2132                     vcpu->exc_vector, info2);
 2133         }
 2134 
 2135         if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
 2136                 valid = nested_fault(vcpu, info1, info2, retinfo);
 2137         } else if (info1 & VM_INTINFO_VALID) {
 2138                 *retinfo = info1;
 2139                 valid = 1;
 2140         } else if (info2 & VM_INTINFO_VALID) {
 2141                 *retinfo = info2;
 2142                 valid = 1;
 2143         } else {
 2144                 valid = 0;
 2145         }
 2146 
 2147         if (valid) {
 2148                 VMM_CTR4(vcpu, "%s: info1(%#lx), info2(%#lx), "
 2149                     "retinfo(%#lx)", __func__, info1, info2, *retinfo);
 2150         }
 2151 
 2152         return (valid);
 2153 }
 2154 
 2155 int
 2156 vm_get_intinfo(struct vcpu *vcpu, uint64_t *info1, uint64_t *info2)
 2157 {
 2158         *info1 = vcpu->exitintinfo;
 2159         *info2 = vcpu_exception_intinfo(vcpu);
 2160         return (0);
 2161 }
 2162 
 2163 int
 2164 vm_inject_exception(struct vcpu *vcpu, int vector, int errcode_valid,
 2165     uint32_t errcode, int restart_instruction)
 2166 {
 2167         uint64_t regval;
 2168         int error __diagused;
 2169 
 2170         if (vector < 0 || vector >= 32)
 2171                 return (EINVAL);
 2172 
 2173         /*
 2174          * A double fault exception should never be injected directly into
 2175          * the guest. It is a derived exception that results from specific
 2176          * combinations of nested faults.
 2177          */
 2178         if (vector == IDT_DF)
 2179                 return (EINVAL);
 2180 
 2181         if (vcpu->exception_pending) {
 2182                 VMM_CTR2(vcpu, "Unable to inject exception %d due to "
 2183                     "pending exception %d", vector, vcpu->exc_vector);
 2184                 return (EBUSY);
 2185         }
 2186 
 2187         if (errcode_valid) {
 2188                 /*
 2189                  * Exceptions don't deliver an error code in real mode.
 2190                  */
 2191                 error = vm_get_register(vcpu, VM_REG_GUEST_CR0, &regval);
 2192                 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
 2193                 if (!(regval & CR0_PE))
 2194                         errcode_valid = 0;
 2195         }
 2196 
 2197         /*
 2198          * From section 26.6.1 "Interruptibility State" in Intel SDM:
 2199          *
 2200          * Event blocking by "STI" or "MOV SS" is cleared after guest executes
 2201          * one instruction or incurs an exception.
 2202          */
 2203         error = vm_set_register(vcpu, VM_REG_GUEST_INTR_SHADOW, 0);
 2204         KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
 2205             __func__, error));
 2206 
 2207         if (restart_instruction)
 2208                 vm_restart_instruction(vcpu);
 2209 
 2210         vcpu->exception_pending = 1;
 2211         vcpu->exc_vector = vector;
 2212         vcpu->exc_errcode = errcode;
 2213         vcpu->exc_errcode_valid = errcode_valid;
 2214         VMM_CTR1(vcpu, "Exception %d pending", vector);
 2215         return (0);
 2216 }
 2217 
 2218 void
 2219 vm_inject_fault(struct vcpu *vcpu, int vector, int errcode_valid, int errcode)
 2220 {
 2221         int error __diagused, restart_instruction;
 2222 
 2223         restart_instruction = 1;
 2224 
 2225         error = vm_inject_exception(vcpu, vector, errcode_valid,
 2226             errcode, restart_instruction);
 2227         KASSERT(error == 0, ("vm_inject_exception error %d", error));
 2228 }
 2229 
 2230 void
 2231 vm_inject_pf(struct vcpu *vcpu, int error_code, uint64_t cr2)
 2232 {
 2233         int error __diagused;
 2234 
 2235         VMM_CTR2(vcpu, "Injecting page fault: error_code %#x, cr2 %#lx",
 2236             error_code, cr2);
 2237 
 2238         error = vm_set_register(vcpu, VM_REG_GUEST_CR2, cr2);
 2239         KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
 2240 
 2241         vm_inject_fault(vcpu, IDT_PF, 1, error_code);
 2242 }
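
      /*
       * Hedged example (not compiled): other faults are reflected into the
       * guest through the same vm_inject_fault() path; only the page fault
       * needs the extra %cr2 update done above. 'vcpu' is assumed from the
       * caller.
       */
      #if 0
              /* Inject a #GP(0) and restart the faulting instruction. */
              vm_inject_fault(vcpu, IDT_GP, 1, 0);
      #endif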
 2243 
 2244 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
 2245 
 2246 int
 2247 vm_inject_nmi(struct vcpu *vcpu)
 2248 {
 2249 
 2250         vcpu->nmi_pending = 1;
 2251         vcpu_notify_event(vcpu, false);
 2252         return (0);
 2253 }
 2254 
 2255 int
 2256 vm_nmi_pending(struct vcpu *vcpu)
 2257 {
 2258         return (vcpu->nmi_pending);
 2259 }
 2260 
 2261 void
 2262 vm_nmi_clear(struct vcpu *vcpu)
 2263 {
 2264         if (vcpu->nmi_pending == 0)
 2265                 panic("vm_nmi_clear: inconsistent nmi_pending state");
 2266 
 2267         vcpu->nmi_pending = 0;
 2268         vmm_stat_incr(vcpu, VCPU_NMI_COUNT, 1);
 2269 }
 2270 
 2271 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
 2272 
 2273 int
 2274 vm_inject_extint(struct vcpu *vcpu)
 2275 {
 2276 
 2277         vcpu->extint_pending = 1;
 2278         vcpu_notify_event(vcpu, false);
 2279         return (0);
 2280 }
 2281 
 2282 int
 2283 vm_extint_pending(struct vcpu *vcpu)
 2284 {
 2285         return (vcpu->extint_pending);
 2286 }
 2287 
 2288 void
 2289 vm_extint_clear(struct vcpu *vcpu)
 2290 {
 2291         if (vcpu->extint_pending == 0)
 2292                 panic("vm_extint_clear: inconsistent extint_pending state");
 2293 
 2294         vcpu->extint_pending = 0;
 2295         vmm_stat_incr(vcpu, VCPU_EXTINT_COUNT, 1);
 2296 }
 2297 
 2298 int
 2299 vm_get_capability(struct vcpu *vcpu, int type, int *retval)
 2300 {
 2301         if (type < 0 || type >= VM_CAP_MAX)
 2302                 return (EINVAL);
 2303 
 2304         return (vmmops_getcap(vcpu->cookie, type, retval));
 2305 }
 2306 
 2307 int
 2308 vm_set_capability(struct vcpu *vcpu, int type, int val)
 2309 {
 2310         if (type < 0 || type >= VM_CAP_MAX)
 2311                 return (EINVAL);
 2312 
 2313         return (vmmops_setcap(vcpu->cookie, type, val));
 2314 }
 2315 
 2316 struct vm *
 2317 vcpu_vm(struct vcpu *vcpu)
 2318 {
 2319         return (vcpu->vm);
 2320 }
 2321 
 2322 int
 2323 vcpu_vcpuid(struct vcpu *vcpu)
 2324 {
 2325         return (vcpu->vcpuid);
 2326 }
 2327 
 2328 struct vcpu *
 2329 vm_vcpu(struct vm *vm, int vcpuid)
 2330 {
 2331         return (vm->vcpu[vcpuid]);
 2332 }
 2333 
 2334 struct vlapic *
 2335 vm_lapic(struct vcpu *vcpu)
 2336 {
 2337         return (vcpu->vlapic);
 2338 }
 2339 
 2340 struct vioapic *
 2341 vm_ioapic(struct vm *vm)
 2342 {
 2343 
 2344         return (vm->vioapic);
 2345 }
 2346 
 2347 struct vhpet *
 2348 vm_hpet(struct vm *vm)
 2349 {
 2350 
 2351         return (vm->vhpet);
 2352 }
 2353 
 2354 bool
 2355 vmm_is_pptdev(int bus, int slot, int func)
 2356 {
 2357         int b, f, i, n, s;
 2358         char *val, *cp, *cp2;
 2359         bool found;
 2360 
 2361         /*
 2362          * XXX
 2363          * The length of an environment variable is limited to 128 bytes, which
 2364          * puts an upper limit on the number of passthru devices that may be
 2365          * specified using a single environment variable.
 2366          *
 2367          * Work around this by scanning multiple environment variable
 2368          * names instead of a single one - yuck!
 2369          */
 2370         const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
 2371 
 2372         /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
 2373         found = false;
 2374         for (i = 0; names[i] != NULL && !found; i++) {
 2375                 cp = val = kern_getenv(names[i]);
 2376                 while (cp != NULL && *cp != '\0') {
 2377                         if ((cp2 = strchr(cp, ' ')) != NULL)
 2378                                 *cp2 = '\0';
 2379 
 2380                         n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
 2381                         if (n == 3 && bus == b && slot == s && func == f) {
 2382                                 found = true;
 2383                                 break;
 2384                         }
 2385                 
 2386                         if (cp2 != NULL)
 2387                                 *cp2++ = ' ';
 2388 
 2389                         cp = cp2;
 2390                 }
 2391                 freeenv(val);
 2392         }
 2393         return (found);
 2394 }
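
      /*
       * Example (hedged): given the 128-byte limit described above, a long
       * device list can be split across the scanned variable names, e.g.
       *
       *   pptdevs="2/0/0 3/0/0 4/0/0"
       *   pptdevs2="10/0/0 11/0/0"
       *
       * vmm_is_pptdev(10, 0, 0) then returns true because "10/0/0" is found
       * while scanning "pptdevs2".
       */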
 2395 
 2396 void *
 2397 vm_iommu_domain(struct vm *vm)
 2398 {
 2399 
 2400         return (vm->iommu);
 2401 }
 2402 
 2403 int
 2404 vcpu_set_state(struct vcpu *vcpu, enum vcpu_state newstate, bool from_idle)
 2405 {
 2406         int error;
 2407 
 2408         vcpu_lock(vcpu);
 2409         error = vcpu_set_state_locked(vcpu, newstate, from_idle);
 2410         vcpu_unlock(vcpu);
 2411 
 2412         return (error);
 2413 }
 2414 
 2415 enum vcpu_state
 2416 vcpu_get_state(struct vcpu *vcpu, int *hostcpu)
 2417 {
 2418         enum vcpu_state state;
 2419 
 2420         vcpu_lock(vcpu);
 2421         state = vcpu->state;
 2422         if (hostcpu != NULL)
 2423                 *hostcpu = vcpu->hostcpu;
 2424         vcpu_unlock(vcpu);
 2425 
 2426         return (state);
 2427 }
 2428 
 2429 int
 2430 vm_activate_cpu(struct vcpu *vcpu)
 2431 {
 2432         struct vm *vm = vcpu->vm;
 2433 
 2434         if (CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
 2435                 return (EBUSY);
 2436 
 2437         VMM_CTR0(vcpu, "activated");
 2438         CPU_SET_ATOMIC(vcpu->vcpuid, &vm->active_cpus);
 2439         return (0);
 2440 }
 2441 
 2442 int
 2443 vm_suspend_cpu(struct vm *vm, struct vcpu *vcpu)
 2444 {
 2445         if (vcpu == NULL) {
 2446                 vm->debug_cpus = vm->active_cpus;
 2447                 for (int i = 0; i < vm->maxcpus; i++) {
 2448                         if (CPU_ISSET(i, &vm->active_cpus))
 2449                                 vcpu_notify_event(vm_vcpu(vm, i), false);
 2450                 }
 2451         } else {
 2452                 if (!CPU_ISSET(vcpu->vcpuid, &vm->active_cpus))
 2453                         return (EINVAL);
 2454 
 2455                 CPU_SET_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
 2456                 vcpu_notify_event(vcpu, false);
 2457         }
 2458         return (0);
 2459 }
 2460 
 2461 int
 2462 vm_resume_cpu(struct vm *vm, struct vcpu *vcpu)
 2463 {
 2464 
 2465         if (vcpu == NULL) {
 2466                 CPU_ZERO(&vm->debug_cpus);
 2467         } else {
 2468                 if (!CPU_ISSET(vcpu->vcpuid, &vm->debug_cpus))
 2469                         return (EINVAL);
 2470 
 2471                 CPU_CLR_ATOMIC(vcpu->vcpuid, &vm->debug_cpus);
 2472         }
 2473         return (0);
 2474 }
 2475 
 2476 int
 2477 vcpu_debugged(struct vcpu *vcpu)
 2478 {
 2479 
 2480         return (CPU_ISSET(vcpu->vcpuid, &vcpu->vm->debug_cpus));
 2481 }
 2482 
 2483 cpuset_t
 2484 vm_active_cpus(struct vm *vm)
 2485 {
 2486 
 2487         return (vm->active_cpus);
 2488 }
 2489 
 2490 cpuset_t
 2491 vm_debug_cpus(struct vm *vm)
 2492 {
 2493 
 2494         return (vm->debug_cpus);
 2495 }
 2496 
 2497 cpuset_t
 2498 vm_suspended_cpus(struct vm *vm)
 2499 {
 2500 
 2501         return (vm->suspended_cpus);
 2502 }
 2503 
 2504 /*
 2505  * Returns the subset of vCPUs in tostart that are awaiting startup.
 2506  * These vCPUs are also marked as no longer awaiting startup.
 2507  */
 2508 cpuset_t
 2509 vm_start_cpus(struct vm *vm, const cpuset_t *tostart)
 2510 {
 2511         cpuset_t set;
 2512 
 2513         mtx_lock(&vm->rendezvous_mtx);
 2514         CPU_AND(&set, &vm->startup_cpus, tostart);
 2515         CPU_ANDNOT(&vm->startup_cpus, &vm->startup_cpus, &set);
 2516         mtx_unlock(&vm->rendezvous_mtx);
 2517         return (set);
 2518 }
 2519 
 2520 void
 2521 vm_await_start(struct vm *vm, const cpuset_t *waiting)
 2522 {
 2523         mtx_lock(&vm->rendezvous_mtx);
 2524         CPU_OR(&vm->startup_cpus, &vm->startup_cpus, waiting);
 2525         mtx_unlock(&vm->rendezvous_mtx);
 2526 }
 2527 
 2528 void *
 2529 vcpu_stats(struct vcpu *vcpu)
 2530 {
 2531 
 2532         return (vcpu->stats);
 2533 }
 2534 
 2535 int
 2536 vm_get_x2apic_state(struct vcpu *vcpu, enum x2apic_state *state)
 2537 {
 2538         *state = vcpu->x2apic_state;
 2539 
 2540         return (0);
 2541 }
 2542 
 2543 int
 2544 vm_set_x2apic_state(struct vcpu *vcpu, enum x2apic_state state)
 2545 {
 2546         if (state >= X2APIC_STATE_LAST)
 2547                 return (EINVAL);
 2548 
 2549         vcpu->x2apic_state = state;
 2550 
 2551         vlapic_set_x2apic_state(vcpu, state);
 2552 
 2553         return (0);
 2554 }
 2555 
 2556 /*
 2557  * This function is called to ensure that a vcpu "sees" a pending event
 2558  * as soon as possible:
 2559  * - If the vcpu thread is sleeping then it is woken up.
 2560  * - If the vcpu is running on a different host_cpu then an IPI will be directed
 2561  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
 2562  */
 2563 static void
 2564 vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
 2565 {
 2566         int hostcpu;
 2567 
 2568         hostcpu = vcpu->hostcpu;
 2569         if (vcpu->state == VCPU_RUNNING) {
 2570                 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
 2571                 if (hostcpu != curcpu) {
 2572                         if (lapic_intr) {
 2573                                 vlapic_post_intr(vcpu->vlapic, hostcpu,
 2574                                     vmm_ipinum);
 2575                         } else {
 2576                                 ipi_cpu(hostcpu, vmm_ipinum);
 2577                         }
 2578                 } else {
 2579                         /*
 2580                          * If the 'vcpu' is running on 'curcpu' then it must
 2581                          * be sending a notification to itself (e.g. SELF_IPI).
 2582                          * The pending event will be picked up when the vcpu
 2583                          * transitions back to guest context.
 2584                          */
 2585                 }
 2586         } else {
 2587                 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
 2588                     "with hostcpu %d", vcpu->state, hostcpu));
 2589                 if (vcpu->state == VCPU_SLEEPING)
 2590                         wakeup_one(vcpu);
 2591         }
 2592 }
 2593 
 2594 void
 2595 vcpu_notify_event(struct vcpu *vcpu, bool lapic_intr)
 2596 {
 2597         vcpu_lock(vcpu);
 2598         vcpu_notify_event_locked(vcpu, lapic_intr);
 2599         vcpu_unlock(vcpu);
 2600 }
 2601 
 2602 struct vmspace *
 2603 vm_get_vmspace(struct vm *vm)
 2604 {
 2605 
 2606         return (vm->vmspace);
 2607 }
 2608 
 2609 int
 2610 vm_apicid2vcpuid(struct vm *vm, int apicid)
 2611 {
 2612         /*
 2613          * XXX apic id is assumed to be numerically identical to vcpu id
 2614          */
 2615         return (apicid);
 2616 }
 2617 
 2618 int
 2619 vm_smp_rendezvous(struct vcpu *vcpu, cpuset_t dest,
 2620     vm_rendezvous_func_t func, void *arg)
 2621 {
 2622         struct vm *vm = vcpu->vm;
 2623         int error, i;
 2624 
 2625         /*
 2626          * Enforce that this function is called without any locks
 2627          */
 2628         WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
 2629 
 2630 restart:
 2631         mtx_lock(&vm->rendezvous_mtx);
 2632         if (vm->rendezvous_func != NULL) {
 2633                 /*
 2634                  * If a rendezvous is already in progress then we need to
 2635                  * call the rendezvous handler in case this 'vcpu' is one
 2636                  * of the targets of the rendezvous.
 2637                  */
 2638                 VMM_CTR0(vcpu, "Rendezvous already in progress");
 2639                 mtx_unlock(&vm->rendezvous_mtx);
 2640                 error = vm_handle_rendezvous(vcpu);
 2641                 if (error != 0)
 2642                         return (error);
 2643                 goto restart;
 2644         }
 2645         KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
 2646             "rendezvous is still in progress"));
 2647 
 2648         VMM_CTR0(vcpu, "Initiating rendezvous");
 2649         vm->rendezvous_req_cpus = dest;
 2650         CPU_ZERO(&vm->rendezvous_done_cpus);
 2651         vm->rendezvous_arg = arg;
 2652         vm->rendezvous_func = func;
 2653         mtx_unlock(&vm->rendezvous_mtx);
 2654 
 2655         /*
 2656          * Wake up any sleeping vcpus and trigger a VM-exit in any running
 2657          * vcpus so they handle the rendezvous as soon as possible.
 2658          */
 2659         for (i = 0; i < vm->maxcpus; i++) {
 2660                 if (CPU_ISSET(i, &dest))
 2661                         vcpu_notify_event(vm_vcpu(vm, i), false);
 2662         }
 2663 
 2664         return (vm_handle_rendezvous(vcpu));
 2665 }
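
      /*
       * Hedged usage sketch (not compiled): the example_* names are not part
       * of this file, and the callback assumes the vm_rendezvous_func_t
       * signature of (struct vcpu *, void *).
       */
      #if 0
      static void
      example_rendezvous_cb(struct vcpu *vcpu, void *arg)
      {
              /* Runs on every targeted vcpu before any of them resumes. */
              VMM_CTR0(vcpu, "example rendezvous callback");
      }

      static int
      example_rendezvous_all(struct vcpu *self)
      {
              cpuset_t dest;

              dest = vm_active_cpus(vcpu_vm(self));
              return (vm_smp_rendezvous(self, dest, example_rendezvous_cb, NULL));
      }
      #endif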
 2666 
 2667 struct vatpic *
 2668 vm_atpic(struct vm *vm)
 2669 {
 2670         return (vm->vatpic);
 2671 }
 2672 
 2673 struct vatpit *
 2674 vm_atpit(struct vm *vm)
 2675 {
 2676         return (vm->vatpit);
 2677 }
 2678 
 2679 struct vpmtmr *
 2680 vm_pmtmr(struct vm *vm)
 2681 {
 2682 
 2683         return (vm->vpmtmr);
 2684 }
 2685 
 2686 struct vrtc *
 2687 vm_rtc(struct vm *vm)
 2688 {
 2689 
 2690         return (vm->vrtc);
 2691 }
 2692 
 2693 enum vm_reg_name
 2694 vm_segment_name(int seg)
 2695 {
 2696         static enum vm_reg_name seg_names[] = {
 2697                 VM_REG_GUEST_ES,
 2698                 VM_REG_GUEST_CS,
 2699                 VM_REG_GUEST_SS,
 2700                 VM_REG_GUEST_DS,
 2701                 VM_REG_GUEST_FS,
 2702                 VM_REG_GUEST_GS
 2703         };
 2704 
 2705         KASSERT(seg >= 0 && seg < nitems(seg_names),
 2706             ("%s: invalid segment encoding %d", __func__, seg));
 2707         return (seg_names[seg]);
 2708 }
 2709 
 2710 void
 2711 vm_copy_teardown(struct vm_copyinfo *copyinfo, int num_copyinfo)
 2712 {
 2713         int idx;
 2714 
 2715         for (idx = 0; idx < num_copyinfo; idx++) {
 2716                 if (copyinfo[idx].cookie != NULL)
 2717                         vm_gpa_release(copyinfo[idx].cookie);
 2718         }
 2719         bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
 2720 }
 2721 
 2722 int
 2723 vm_copy_setup(struct vcpu *vcpu, struct vm_guest_paging *paging,
 2724     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
 2725     int num_copyinfo, int *fault)
 2726 {
 2727         int error, idx, nused;
 2728         size_t n, off, remaining;
 2729         void *hva, *cookie;
 2730         uint64_t gpa;
 2731 
 2732         bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
 2733 
 2734         nused = 0;
 2735         remaining = len;
 2736         while (remaining > 0) {
 2737                 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
 2738                 error = vm_gla2gpa(vcpu, paging, gla, prot, &gpa, fault);
 2739                 if (error || *fault)
 2740                         return (error);
 2741                 off = gpa & PAGE_MASK;
 2742                 n = min(remaining, PAGE_SIZE - off);
 2743                 copyinfo[nused].gpa = gpa;
 2744                 copyinfo[nused].len = n;
 2745                 remaining -= n;
 2746                 gla += n;
 2747                 nused++;
 2748         }
 2749 
 2750         for (idx = 0; idx < nused; idx++) {
 2751                 hva = vm_gpa_hold(vcpu, copyinfo[idx].gpa,
 2752                     copyinfo[idx].len, prot, &cookie);
 2753                 if (hva == NULL)
 2754                         break;
 2755                 copyinfo[idx].hva = hva;
 2756                 copyinfo[idx].cookie = cookie;
 2757         }
 2758 
 2759         if (idx != nused) {
 2760                 vm_copy_teardown(copyinfo, num_copyinfo);
 2761                 return (EFAULT);
 2762         } else {
 2763                 *fault = 0;
 2764                 return (0);
 2765         }
 2766 }
 2767 
 2768 void
 2769 vm_copyin(struct vm_copyinfo *copyinfo, void *kaddr, size_t len)
 2770 {
 2771         char *dst;
 2772         int idx;
 2773 
 2774         dst = kaddr;
 2775         idx = 0;
 2776         while (len > 0) {
 2777                 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
 2778                 len -= copyinfo[idx].len;
 2779                 dst += copyinfo[idx].len;
 2780                 idx++;
 2781         }
 2782 }
 2783 
 2784 void
 2785 vm_copyout(const void *kaddr, struct vm_copyinfo *copyinfo, size_t len)
 2786 {
 2787         const char *src;
 2788         int idx;
 2789 
 2790         src = kaddr;
 2791         idx = 0;
 2792         while (len > 0) {
 2793                 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
 2794                 len -= copyinfo[idx].len;
 2795                 src += copyinfo[idx].len;
 2796                 idx++;
 2797         }
 2798 }
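
      /*
       * Hedged sketch (not compiled) of the copy protocol above: map a guest
       * linear region that may straddle a page boundary, copy it in, then
       * drop the references. 'vcpu', 'paging' and 'gla' are assumed to come
       * from the exit being emulated.
       */
      #if 0
              struct vm_copyinfo copyinfo[2]; /* <= PAGE_SIZE spans 2 pages at most */
              uint64_t buf[2];
              int error, fault;

              error = vm_copy_setup(vcpu, paging, gla, sizeof(buf), VM_PROT_READ,
                  copyinfo, nitems(copyinfo), &fault);
              if (error == 0 && fault == 0) {
                      vm_copyin(copyinfo, buf, sizeof(buf));
                      vm_copy_teardown(copyinfo, nitems(copyinfo));
              }
      #endif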
 2799 
 2800 /*
 2801  * Return the amount of in-use and wired memory for the VM. Since
 2802  * these are global stats, only return the values for vCPU 0.
 2803  */
 2804 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
 2805 VMM_STAT_DECLARE(VMM_MEM_WIRED);
 2806 
 2807 static void
 2808 vm_get_rescnt(struct vcpu *vcpu, struct vmm_stat_type *stat)
 2809 {
 2810 
 2811         if (vcpu->vcpuid == 0) {
 2812                 vmm_stat_set(vcpu, VMM_MEM_RESIDENT, PAGE_SIZE *
 2813                     vmspace_resident_count(vcpu->vm->vmspace));
 2814         }       
 2815 }
 2816 
 2817 static void
 2818 vm_get_wiredcnt(struct vcpu *vcpu, struct vmm_stat_type *stat)
 2819 {
 2820 
 2821         if (vcpu->vcpuid == 0) {
 2822                 vmm_stat_set(vcpu, VMM_MEM_WIRED, PAGE_SIZE *
 2823                     pmap_wired_count(vmspace_pmap(vcpu->vm->vmspace)));
 2824         }       
 2825 }
 2826 
 2827 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
 2828 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
 2829 
 2830 #ifdef BHYVE_SNAPSHOT
 2831 static int
 2832 vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta)
 2833 {
 2834         uint64_t tsc, now;
 2835         int ret;
 2836         struct vcpu *vcpu;
 2837         uint16_t i, maxcpus;
 2838 
 2839         now = rdtsc();
 2840         maxcpus = vm_get_maxcpus(vm);
 2841         for (i = 0; i < maxcpus; i++) {
 2842                 vcpu = vm->vcpu[i];
 2843                 if (vcpu == NULL)
 2844                         continue;
 2845 
 2846                 SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done);
 2847                 SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done);
 2848                 SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_vector, meta, ret, done);
 2849                 SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode_valid, meta, ret, done);
 2850                 SNAPSHOT_VAR_OR_LEAVE(vcpu->exc_errcode, meta, ret, done);
 2851                 SNAPSHOT_VAR_OR_LEAVE(vcpu->guest_xcr0, meta, ret, done);
 2852                 SNAPSHOT_VAR_OR_LEAVE(vcpu->exitinfo, meta, ret, done);
 2853                 SNAPSHOT_VAR_OR_LEAVE(vcpu->nextrip, meta, ret, done);
 2854 
 2855                 /*
 2856                  * Save the absolute TSC value by adding now to tsc_offset.
 2857                  *
 2858                  * It will be turned back into an actual offset when the
 2859                  * TSC restore function is called.
 2860                  */
 2861                 tsc = now + vcpu->tsc_offset;
 2862                 SNAPSHOT_VAR_OR_LEAVE(tsc, meta, ret, done);
 2863         }
 2864 
 2865 done:
 2866         return (ret);
 2867 }
 2868 
 2869 static int
 2870 vm_snapshot_vm(struct vm *vm, struct vm_snapshot_meta *meta)
 2871 {
 2872         int ret;
 2873 
 2874         ret = vm_snapshot_vcpus(vm, meta);
 2875         if (ret != 0)
 2876                 goto done;
 2877 
 2878         SNAPSHOT_VAR_OR_LEAVE(vm->startup_cpus, meta, ret, done);
 2879 done:
 2880         return (ret);
 2881 }
 2882 
 2883 static int
 2884 vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta)
 2885 {
 2886         int error;
 2887         struct vcpu *vcpu;
 2888         uint16_t i, maxcpus;
 2889 
 2890         error = 0;
 2891 
 2892         maxcpus = vm_get_maxcpus(vm);
 2893         for (i = 0; i < maxcpus; i++) {
 2894                 vcpu = vm->vcpu[i];
 2895                 if (vcpu == NULL)
 2896                         continue;
 2897 
 2898                 error = vmmops_vcpu_snapshot(vcpu->cookie, meta);
 2899                 if (error != 0) {
 2900                         printf("%s: failed to snapshot vmcs/vmcb data for "
 2901                                "vCPU: %d; error: %d\n", __func__, i, error);
 2902                         goto done;
 2903                 }
 2904         }
 2905 
 2906 done:
 2907         return (error);
 2908 }
 2909 
 2910 /*
 2911  * Save kernel-side structures to user-space for snapshotting.
 2912  */
 2913 int
 2914 vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta)
 2915 {
 2916         int ret = 0;
 2917 
 2918         switch (meta->dev_req) {
 2919         case STRUCT_VMX:
 2920                 ret = vmmops_snapshot(vm->cookie, meta);
 2921                 break;
 2922         case STRUCT_VMCX:
 2923                 ret = vm_snapshot_vcpu(vm, meta);
 2924                 break;
 2925         case STRUCT_VM:
 2926                 ret = vm_snapshot_vm(vm, meta);
 2927                 break;
 2928         case STRUCT_VIOAPIC:
 2929                 ret = vioapic_snapshot(vm_ioapic(vm), meta);
 2930                 break;
 2931         case STRUCT_VLAPIC:
 2932                 ret = vlapic_snapshot(vm, meta);
 2933                 break;
 2934         case STRUCT_VHPET:
 2935                 ret = vhpet_snapshot(vm_hpet(vm), meta);
 2936                 break;
 2937         case STRUCT_VATPIC:
 2938                 ret = vatpic_snapshot(vm_atpic(vm), meta);
 2939                 break;
 2940         case STRUCT_VATPIT:
 2941                 ret = vatpit_snapshot(vm_atpit(vm), meta);
 2942                 break;
 2943         case STRUCT_VPMTMR:
 2944                 ret = vpmtmr_snapshot(vm_pmtmr(vm), meta);
 2945                 break;
 2946         case STRUCT_VRTC:
 2947                 ret = vrtc_snapshot(vm_rtc(vm), meta);
 2948                 break;
 2949         default:
 2950                 printf("%s: failed to find the requested type %#x\n",
 2951                        __func__, meta->dev_req);
 2952                 ret = (EINVAL);
 2953         }
 2954         return (ret);
 2955 }
 2956 
 2957 void
 2958 vm_set_tsc_offset(struct vcpu *vcpu, uint64_t offset)
 2959 {
 2960         vcpu->tsc_offset = offset;
 2961 }
 2962 
 2963 int
 2964 vm_restore_time(struct vm *vm)
 2965 {
 2966         int error;
 2967         uint64_t now;
 2968         struct vcpu *vcpu;
 2969         uint16_t i, maxcpus;
 2970 
 2971         now = rdtsc();
 2972 
 2973         error = vhpet_restore_time(vm_hpet(vm));
 2974         if (error)
 2975                 return (error);
 2976 
 2977         maxcpus = vm_get_maxcpus(vm);
 2978         for (i = 0; i < maxcpus; i++) {
 2979                 vcpu = vm->vcpu[i];
 2980                 if (vcpu == NULL)
 2981                         continue;
 2982 
 2983                 error = vmmops_restore_tsc(vcpu->cookie,
 2984                     vcpu->tsc_offset - now);
 2985                 if (error)
 2986                         return (error);
 2987         }
 2988 
 2989         return (0);
 2990 }
 2991 #endif
