FreeBSD/Linux Kernel Cross Reference
sys/amd64/vmm/vmm.c
1 /*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: releng/10.1/sys/amd64/vmm/vmm.c 270159 2014-08-19 01:20:24Z grehan $
27 */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD: releng/10.1/sys/amd64/vmm/vmm.c 270159 2014-08-19 01:20:24Z grehan $");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/module.h>
36 #include <sys/sysctl.h>
37 #include <sys/malloc.h>
38 #include <sys/pcpu.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/rwlock.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
46
47 #include <vm/vm.h>
48 #include <vm/vm_object.h>
49 #include <vm/vm_page.h>
50 #include <vm/pmap.h>
51 #include <vm/vm_map.h>
52 #include <vm/vm_extern.h>
53 #include <vm/vm_param.h>
54
55 #include <machine/cpu.h>
56 #include <machine/vm.h>
57 #include <machine/pcb.h>
58 #include <machine/smp.h>
59 #include <x86/psl.h>
60 #include <x86/apicreg.h>
61 #include <machine/vmparam.h>
62
63 #include <machine/vmm.h>
64 #include <machine/vmm_dev.h>
65 #include <machine/vmm_instruction_emul.h>
66
67 #include "vmm_ioport.h"
68 #include "vmm_ktr.h"
69 #include "vmm_host.h"
70 #include "vmm_mem.h"
71 #include "vmm_util.h"
72 #include "vatpic.h"
73 #include "vatpit.h"
74 #include "vhpet.h"
75 #include "vioapic.h"
76 #include "vlapic.h"
77 #include "vmm_msr.h"
78 #include "vmm_ipi.h"
79 #include "vmm_stat.h"
80 #include "vmm_lapic.h"
81
82 #include "io/ppt.h"
83 #include "io/iommu.h"
84
85 struct vlapic;
86
87 /*
88 * Initialization:
89 * (a) allocated when vcpu is created
90 * (i) initialized when vcpu is created and when it is reinitialized
91 * (o) initialized the first time the vcpu is created
92 * (x) initialized before use
93 */
94 struct vcpu {
95 struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
96 enum vcpu_state state; /* (o) vcpu state */
97 int hostcpu; /* (o) vcpu's host cpu */
98 struct vlapic *vlapic; /* (i) APIC device model */
99 enum x2apic_state x2apic_state; /* (i) APIC mode */
100 uint64_t exitintinfo; /* (i) events pending at VM exit */
101 int nmi_pending; /* (i) NMI pending */
102 int extint_pending; /* (i) INTR pending */
103 struct vm_exception exception; /* (x) exception collateral */
104 int exception_pending; /* (i) exception pending */
105 struct savefpu *guestfpu; /* (a,i) guest fpu state */
106 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
107 void *stats; /* (a,i) statistics */
108 uint64_t guest_msrs[VMM_MSR_NUM]; /* (i) emulated MSRs */
109 struct vm_exit exitinfo; /* (x) exit reason and collateral */
110 };
111
112 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
113 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
114 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
115 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
116 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
117
118 struct mem_seg {
119 vm_paddr_t gpa;
120 size_t len;
121 boolean_t wired;
122 vm_object_t object;
123 };
124 #define VM_MAX_MEMORY_SEGMENTS 2
125
126 /*
127 * Initialization:
128 * (o) initialized the first time the VM is created
129 * (i) initialized when VM is created and when it is reinitialized
130 * (x) initialized before use
131 */
132 struct vm {
133 void *cookie; /* (i) cpu-specific data */
134 void *iommu; /* (x) iommu-specific data */
135 struct vhpet *vhpet; /* (i) virtual HPET */
136 struct vioapic *vioapic; /* (i) virtual ioapic */
137 struct vatpic *vatpic; /* (i) virtual atpic */
138 struct vatpit *vatpit; /* (i) virtual atpit */
139 volatile cpuset_t active_cpus; /* (i) active vcpus */
140 int suspend; /* (i) stop VM execution */
141 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
142 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
143 cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */
144 cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */
145 void *rendezvous_arg; /* (x) rendezvous func/arg */
146 vm_rendezvous_func_t rendezvous_func;
147 struct mtx rendezvous_mtx; /* (o) rendezvous lock */
148 int num_mem_segs; /* (o) guest memory segments */
149 struct mem_seg mem_segs[VM_MAX_MEMORY_SEGMENTS];
150 struct vmspace *vmspace; /* (o) guest's address space */
151 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
152 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
153 };
154
155 static int vmm_initialized;
156
157 static struct vmm_ops *ops;
158 #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0)
159 #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
160 #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0)
161
162 #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
163 #define VMRUN(vmi, vcpu, rip, pmap, rptr, sptr) \
164 (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, rptr, sptr) : ENXIO)
165 #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
166 #define VMSPACE_ALLOC(min, max) \
167 (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
168 #define VMSPACE_FREE(vmspace) \
169 (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
170 #define VMGETREG(vmi, vcpu, num, retval) \
171 (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
172 #define VMSETREG(vmi, vcpu, num, val) \
173 (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
174 #define VMGETDESC(vmi, vcpu, num, desc) \
175 (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
176 #define VMSETDESC(vmi, vcpu, num, desc) \
177 (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
178 #define VMGETCAP(vmi, vcpu, num, retval) \
179 (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
180 #define VMSETCAP(vmi, vcpu, num, val) \
181 (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
182 #define VLAPIC_INIT(vmi, vcpu) \
183 (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
184 #define VLAPIC_CLEANUP(vmi, vlapic) \
185 (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
186
187 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
188 #define fpu_stop_emulating() clts()
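/*
 * Editorial note: "emulation" here is the classic CR0.TS trick.  With
 * CR0.TS set, the next FPU/SSE instruction raises #NM, so any host use
 * of the FPU while guest state is loaded gets trapped.  A minimal
 * sketch of the intended pairing, using the names defined above:
 *
 *	fpu_stop_emulating();		clear CR0.TS
 *	fpurestore(vcpu->guestfpu);	load the guest FPU state
 *	fpu_start_emulating();		set CR0.TS; host FPU use now traps
 */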
189
190 static MALLOC_DEFINE(M_VM, "vm", "vm");
191 CTASSERT(VMM_MSR_NUM <= 64); /* msr_mask can keep track of up to 64 msrs */
192
193 /* statistics */
194 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
195
196 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
197
198 /*
199 * Halt the guest if all vcpus are executing a HLT instruction with
200 * interrupts disabled.
201 */
202 static int halt_detection_enabled = 1;
203 TUNABLE_INT("hw.vmm.halt_detection", &halt_detection_enabled);
204 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
205 &halt_detection_enabled, 0,
206 "Halt VM if all vcpus execute HLT with interrupts disabled");
207
208 static int vmm_ipinum;
209 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
210 "IPI vector used for vcpu notifications");
211
212 static void
213 vcpu_cleanup(struct vm *vm, int i, bool destroy)
214 {
215 struct vcpu *vcpu = &vm->vcpu[i];
216
217 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
218 if (destroy) {
219 vmm_stat_free(vcpu->stats);
220 fpu_save_area_free(vcpu->guestfpu);
221 }
222 }
223
224 static void
225 vcpu_init(struct vm *vm, int vcpu_id, bool create)
226 {
227 struct vcpu *vcpu;
228
229 KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
230 ("vcpu_init: invalid vcpu %d", vcpu_id));
231
232 vcpu = &vm->vcpu[vcpu_id];
233
234 if (create) {
235 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
236 "initialized", vcpu_id));
237 vcpu_lock_init(vcpu);
238 vcpu->state = VCPU_IDLE;
239 vcpu->hostcpu = NOCPU;
240 vcpu->guestfpu = fpu_save_area_alloc();
241 vcpu->stats = vmm_stat_alloc();
242 }
243
244 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
245 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
246 vcpu->exitintinfo = 0;
247 vcpu->nmi_pending = 0;
248 vcpu->extint_pending = 0;
249 vcpu->exception_pending = 0;
250 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
251 fpu_save_area_reset(vcpu->guestfpu);
252 vmm_stat_init(vcpu->stats);
253 guest_msrs_init(vm, vcpu_id);
254 }
255
256 struct vm_exit *
257 vm_exitinfo(struct vm *vm, int cpuid)
258 {
259 struct vcpu *vcpu;
260
261 if (cpuid < 0 || cpuid >= VM_MAXCPU)
262 panic("vm_exitinfo: invalid cpuid %d", cpuid);
263
264 vcpu = &vm->vcpu[cpuid];
265
266 return (&vcpu->exitinfo);
267 }
268
269 static void
270 vmm_resume(void)
271 {
272 VMM_RESUME();
273 }
274
275 static int
276 vmm_init(void)
277 {
278 int error;
279
280 vmm_host_state_init();
281
282 vmm_ipinum = vmm_ipi_alloc();
283 if (vmm_ipinum == 0)
284 vmm_ipinum = IPI_AST;
285
286 error = vmm_mem_init();
287 if (error)
288 return (error);
289
290 if (vmm_is_intel())
291 ops = &vmm_ops_intel;
292 else if (vmm_is_amd())
293 ops = &vmm_ops_amd;
294 else
295 return (ENXIO);
296
297 vmm_msr_init();
298 vmm_resume_p = vmm_resume;
299
300 return (VMM_INIT(vmm_ipinum));
301 }
302
303 static int
304 vmm_handler(module_t mod, int what, void *arg)
305 {
306 int error;
307
308 switch (what) {
309 case MOD_LOAD:
310 vmmdev_init();
311 if (ppt_avail_devices() > 0)
312 iommu_init();
313 error = vmm_init();
314 if (error == 0)
315 vmm_initialized = 1;
316 break;
317 case MOD_UNLOAD:
318 error = vmmdev_cleanup();
319 if (error == 0) {
320 vmm_resume_p = NULL;
321 iommu_cleanup();
322 if (vmm_ipinum != IPI_AST)
323 vmm_ipi_free(vmm_ipinum);
324 error = VMM_CLEANUP();
325 /*
326 * Something bad happened - prevent new
327 * VMs from being created
328 */
329 if (error)
330 vmm_initialized = 0;
331 }
332 break;
333 default:
334 error = 0;
335 break;
336 }
337 return (error);
338 }
339
340 static moduledata_t vmm_kmod = {
341 "vmm",
342 vmm_handler,
343 NULL
344 };
345
346 /*
347 * vmm initialization has the following dependencies:
348 *
349 * - iommu initialization must happen after the pci passthru driver has had
350 * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
351 *
352 * - VT-x initialization requires smp_rendezvous() and therefore must happen
353 * after SMP is fully functional (after SI_SUB_SMP).
354 */
355 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
356 MODULE_VERSION(vmm, 1);
357
358 static void
359 vm_init(struct vm *vm, bool create)
360 {
361 int i;
362
363 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
364 vm->iommu = NULL;
365 vm->vioapic = vioapic_init(vm);
366 vm->vhpet = vhpet_init(vm);
367 vm->vatpic = vatpic_init(vm);
368 vm->vatpit = vatpit_init(vm);
369
370 CPU_ZERO(&vm->active_cpus);
371
372 vm->suspend = 0;
373 CPU_ZERO(&vm->suspended_cpus);
374
375 for (i = 0; i < VM_MAXCPU; i++)
376 vcpu_init(vm, i, create);
377 }
378
379 int
380 vm_create(const char *name, struct vm **retvm)
381 {
382 struct vm *vm;
383 struct vmspace *vmspace;
384
385 /*
386 * If vmm.ko could not be successfully initialized then don't attempt
387 * to create the virtual machine.
388 */
389 if (!vmm_initialized)
390 return (ENXIO);
391
392 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
393 return (EINVAL);
394
395 vmspace = VMSPACE_ALLOC(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS);
396 if (vmspace == NULL)
397 return (ENOMEM);
398
399 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
400 strcpy(vm->name, name);
401 vm->num_mem_segs = 0;
402 vm->vmspace = vmspace;
403 mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
404
405 vm_init(vm, true);
406
407 *retvm = vm;
408 return (0);
409 }
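/*
 * Illustrative lifecycle (editorial sketch, not original source): the
 * vmmdev ioctl layer drives a VM roughly as follows; error handling is
 * elided and "guest0" and the segment size are hypothetical values.
 *
 *	struct vm *vm;
 *	error = vm_create("guest0", &vm);		allocate vmspace and vcpus
 *	error = vm_malloc(vm, 0, 256 * 1024 * 1024);	guest memory at gpa 0
 *	error = vm_activate_cpu(vm, 0);			mark vcpu 0 active
 *	error = vm_run(vm, &vmrun);			enter the guest, handle exits
 *	vm_destroy(vm);					tear everything down
 */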
410
411 static void
412 vm_free_mem_seg(struct vm *vm, struct mem_seg *seg)
413 {
414
415 if (seg->object != NULL)
416 vmm_mem_free(vm->vmspace, seg->gpa, seg->len);
417
418 bzero(seg, sizeof(*seg));
419 }
420
421 static void
422 vm_cleanup(struct vm *vm, bool destroy)
423 {
424 int i;
425
426 ppt_unassign_all(vm);
427
428 if (vm->iommu != NULL)
429 iommu_destroy_domain(vm->iommu);
430
431 vatpit_cleanup(vm->vatpit);
432 vhpet_cleanup(vm->vhpet);
433 vatpic_cleanup(vm->vatpic);
434 vioapic_cleanup(vm->vioapic);
435
436 for (i = 0; i < VM_MAXCPU; i++)
437 vcpu_cleanup(vm, i, destroy);
438
439 VMCLEANUP(vm->cookie);
440
441 if (destroy) {
442 for (i = 0; i < vm->num_mem_segs; i++)
443 vm_free_mem_seg(vm, &vm->mem_segs[i]);
444
445 vm->num_mem_segs = 0;
446
447 VMSPACE_FREE(vm->vmspace);
448 vm->vmspace = NULL;
449 }
450 }
451
452 void
453 vm_destroy(struct vm *vm)
454 {
455 vm_cleanup(vm, true);
456 free(vm, M_VM);
457 }
458
459 int
460 vm_reinit(struct vm *vm)
461 {
462 int error;
463
464 /*
465 * A virtual machine can be reset only if all vcpus are suspended.
466 */
467 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
468 vm_cleanup(vm, false);
469 vm_init(vm, false);
470 error = 0;
471 } else {
472 error = EBUSY;
473 }
474
475 return (error);
476 }
477
478 const char *
479 vm_name(struct vm *vm)
480 {
481 return (vm->name);
482 }
483
484 int
485 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
486 {
487 vm_object_t obj;
488
489 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
490 return (ENOMEM);
491 else
492 return (0);
493 }
494
495 int
496 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
497 {
498
499 vmm_mmio_free(vm->vmspace, gpa, len);
500 return (0);
501 }
502
503 boolean_t
504 vm_mem_allocated(struct vm *vm, vm_paddr_t gpa)
505 {
506 int i;
507 vm_paddr_t gpabase, gpalimit;
508
509 for (i = 0; i < vm->num_mem_segs; i++) {
510 gpabase = vm->mem_segs[i].gpa;
511 gpalimit = gpabase + vm->mem_segs[i].len;
512 if (gpa >= gpabase && gpa < gpalimit)
513 return (TRUE); /* 'gpa' is regular memory */
514 }
515
516 if (ppt_is_mmio(vm, gpa))
517 return (TRUE); /* 'gpa' is pci passthru mmio */
518
519 return (FALSE);
520 }
521
522 int
523 vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
524 {
525 int available, allocated;
526 struct mem_seg *seg;
527 vm_object_t object;
528 vm_paddr_t g;
529
530 if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
531 return (EINVAL);
532
533 available = allocated = 0;
534 g = gpa;
535 while (g < gpa + len) {
536 if (vm_mem_allocated(vm, g))
537 allocated++;
538 else
539 available++;
540
541 g += PAGE_SIZE;
542 }
543
544 /*
545 * If there are some allocated and some available pages in the address
546 * range then it is an error.
547 */
548 if (allocated && available)
549 return (EINVAL);
550
551 /*
552 * If the entire address range being requested has already been
553 * allocated then there isn't anything more to do.
554 */
555 if (allocated && available == 0)
556 return (0);
557
558 if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
559 return (E2BIG);
560
561 seg = &vm->mem_segs[vm->num_mem_segs];
562
563 if ((object = vmm_mem_alloc(vm->vmspace, gpa, len)) == NULL)
564 return (ENOMEM);
565
566 seg->gpa = gpa;
567 seg->len = len;
568 seg->object = object;
569 seg->wired = FALSE;
570
571 vm->num_mem_segs++;
572
573 return (0);
574 }
575
576 static vm_paddr_t
577 vm_maxmem(struct vm *vm)
578 {
579 int i;
580 vm_paddr_t gpa, maxmem;
581
582 maxmem = 0;
583 for (i = 0; i < vm->num_mem_segs; i++) {
584 gpa = vm->mem_segs[i].gpa + vm->mem_segs[i].len;
585 if (gpa > maxmem)
586 maxmem = gpa;
587 }
588 return (maxmem);
589 }
590
591 static void
592 vm_gpa_unwire(struct vm *vm)
593 {
594 int i, rv;
595 struct mem_seg *seg;
596
597 for (i = 0; i < vm->num_mem_segs; i++) {
598 seg = &vm->mem_segs[i];
599 if (!seg->wired)
600 continue;
601
602 rv = vm_map_unwire(&vm->vmspace->vm_map,
603 seg->gpa, seg->gpa + seg->len,
604 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
605 KASSERT(rv == KERN_SUCCESS, ("vm(%s) memory segment "
606 "%#lx/%ld could not be unwired: %d",
607 vm_name(vm), seg->gpa, seg->len, rv));
608
609 seg->wired = FALSE;
610 }
611 }
612
613 static int
614 vm_gpa_wire(struct vm *vm)
615 {
616 int i, rv;
617 struct mem_seg *seg;
618
619 for (i = 0; i < vm->num_mem_segs; i++) {
620 seg = &vm->mem_segs[i];
621 if (seg->wired)
622 continue;
623
624 /* XXX rlimits? */
625 rv = vm_map_wire(&vm->vmspace->vm_map,
626 seg->gpa, seg->gpa + seg->len,
627 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
628 if (rv != KERN_SUCCESS)
629 break;
630
631 seg->wired = TRUE;
632 }
633
634 if (i < vm->num_mem_segs) {
635 /*
636 * Undo the wiring before returning an error.
637 */
638 vm_gpa_unwire(vm);
639 return (EAGAIN);
640 }
641
642 return (0);
643 }
644
645 static void
646 vm_iommu_modify(struct vm *vm, boolean_t map)
647 {
648 int i, sz;
649 vm_paddr_t gpa, hpa;
650 struct mem_seg *seg;
651 void *vp, *cookie, *host_domain;
652
653 sz = PAGE_SIZE;
654 host_domain = iommu_host_domain();
655
656 for (i = 0; i < vm->num_mem_segs; i++) {
657 seg = &vm->mem_segs[i];
658 KASSERT(seg->wired, ("vm(%s) memory segment %#lx/%ld not wired",
659 vm_name(vm), seg->gpa, seg->len));
660
661 gpa = seg->gpa;
662 while (gpa < seg->gpa + seg->len) {
663 vp = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_WRITE,
664 &cookie);
665 KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
666 vm_name(vm), gpa));
667
668 vm_gpa_release(cookie);
669
670 hpa = DMAP_TO_PHYS((uintptr_t)vp);
671 if (map) {
672 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
673 iommu_remove_mapping(host_domain, hpa, sz);
674 } else {
675 iommu_remove_mapping(vm->iommu, gpa, sz);
676 iommu_create_mapping(host_domain, hpa, hpa, sz);
677 }
678
679 gpa += PAGE_SIZE;
680 }
681 }
682
683 /*
684 * Invalidate the cached translations associated with the domain
685 * from which pages were removed.
686 */
687 if (map)
688 iommu_invalidate_tlb(host_domain);
689 else
690 iommu_invalidate_tlb(vm->iommu);
691 }
692
693 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
694 #define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
695
696 int
697 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
698 {
699 int error;
700
701 error = ppt_unassign_device(vm, bus, slot, func);
702 if (error)
703 return (error);
704
705 if (ppt_assigned_devices(vm) == 0) {
706 vm_iommu_unmap(vm);
707 vm_gpa_unwire(vm);
708 }
709 return (0);
710 }
711
712 int
713 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
714 {
715 int error;
716 vm_paddr_t maxaddr;
717
718 /*
719 * Virtual machines with pci passthru devices get special treatment:
720 * - the guest physical memory is wired
721 * - the iommu is programmed to do the 'gpa' to 'hpa' translation
722 *
723 * We need to do this before the first pci passthru device is attached.
724 */
725 if (ppt_assigned_devices(vm) == 0) {
726 KASSERT(vm->iommu == NULL,
727 ("vm_assign_pptdev: iommu must be NULL"));
728 maxaddr = vm_maxmem(vm);
729 vm->iommu = iommu_create_domain(maxaddr);
730
731 error = vm_gpa_wire(vm);
732 if (error)
733 return (error);
734
735 vm_iommu_map(vm);
736 }
737
738 error = ppt_assign_device(vm, bus, slot, func);
739 return (error);
740 }
741
742 void *
743 vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
744 void **cookie)
745 {
746 int count, pageoff;
747 vm_page_t m;
748
749 pageoff = gpa & PAGE_MASK;
750 if (len > PAGE_SIZE - pageoff)
751 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
752
753 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
754 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
755
756 if (count == 1) {
757 *cookie = m;
758 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
759 } else {
760 *cookie = NULL;
761 return (NULL);
762 }
763 }
764
765 void
766 vm_gpa_release(void *cookie)
767 {
768 vm_page_t m = cookie;
769
770 vm_page_lock(m);
771 vm_page_unhold(m);
772 vm_page_unlock(m);
773 }
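/*
 * Illustrative pairing (editorial sketch): vm_gpa_hold() pins at most
 * one page and must be balanced by vm_gpa_release() on the returned
 * cookie, as done in vm_iommu_modify() and vm_copy_setup().  Assuming
 * a page-aligned 'gpa':
 *
 *	void *cookie, *hva;
 *	hva = vm_gpa_hold(vm, gpa, PAGE_SIZE, VM_PROT_READ, &cookie);
 *	if (hva != NULL) {
 *		... access the guest page through 'hva' ...
 *		vm_gpa_release(cookie);
 *	}
 */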
774
775 int
776 vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
777 struct vm_memory_segment *seg)
778 {
779 int i;
780
781 for (i = 0; i < vm->num_mem_segs; i++) {
782 if (gpabase == vm->mem_segs[i].gpa) {
783 seg->gpa = vm->mem_segs[i].gpa;
784 seg->len = vm->mem_segs[i].len;
785 seg->wired = vm->mem_segs[i].wired;
786 return (0);
787 }
788 }
789 return (-1);
790 }
791
792 int
793 vm_get_memobj(struct vm *vm, vm_paddr_t gpa, size_t len,
794 vm_offset_t *offset, struct vm_object **object)
795 {
796 int i;
797 size_t seg_len;
798 vm_paddr_t seg_gpa;
799 vm_object_t seg_obj;
800
801 for (i = 0; i < vm->num_mem_segs; i++) {
802 if ((seg_obj = vm->mem_segs[i].object) == NULL)
803 continue;
804
805 seg_gpa = vm->mem_segs[i].gpa;
806 seg_len = vm->mem_segs[i].len;
807
808 if (gpa >= seg_gpa && gpa < seg_gpa + seg_len) {
809 *offset = gpa - seg_gpa;
810 *object = seg_obj;
811 vm_object_reference(seg_obj);
812 return (0);
813 }
814 }
815
816 return (EINVAL);
817 }
818
819 int
820 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
821 {
822
823 if (vcpu < 0 || vcpu >= VM_MAXCPU)
824 return (EINVAL);
825
826 if (reg >= VM_REG_LAST)
827 return (EINVAL);
828
829 return (VMGETREG(vm->cookie, vcpu, reg, retval));
830 }
831
832 int
833 vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val)
834 {
835
836 if (vcpu < 0 || vcpu >= VM_MAXCPU)
837 return (EINVAL);
838
839 if (reg >= VM_REG_LAST)
840 return (EINVAL);
841
842 return (VMSETREG(vm->cookie, vcpu, reg, val));
843 }
844
845 static boolean_t
846 is_descriptor_table(int reg)
847 {
848
849 switch (reg) {
850 case VM_REG_GUEST_IDTR:
851 case VM_REG_GUEST_GDTR:
852 return (TRUE);
853 default:
854 return (FALSE);
855 }
856 }
857
858 static boolean_t
859 is_segment_register(int reg)
860 {
861
862 switch (reg) {
863 case VM_REG_GUEST_ES:
864 case VM_REG_GUEST_CS:
865 case VM_REG_GUEST_SS:
866 case VM_REG_GUEST_DS:
867 case VM_REG_GUEST_FS:
868 case VM_REG_GUEST_GS:
869 case VM_REG_GUEST_TR:
870 case VM_REG_GUEST_LDTR:
871 return (TRUE);
872 default:
873 return (FALSE);
874 }
875 }
876
877 int
878 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
879 struct seg_desc *desc)
880 {
881
882 if (vcpu < 0 || vcpu >= VM_MAXCPU)
883 return (EINVAL);
884
885 if (!is_segment_register(reg) && !is_descriptor_table(reg))
886 return (EINVAL);
887
888 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
889 }
890
891 int
892 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
893 struct seg_desc *desc)
894 {
895 if (vcpu < 0 || vcpu >= VM_MAXCPU)
896 return (EINVAL);
897
898 if (!is_segment_register(reg) && !is_descriptor_table(reg))
899 return (EINVAL);
900
901 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
902 }
903
904 static void
905 restore_guest_fpustate(struct vcpu *vcpu)
906 {
907
908 /* flush host state to the pcb */
909 fpuexit(curthread);
910
911 /* restore guest FPU state */
912 fpu_stop_emulating();
913 fpurestore(vcpu->guestfpu);
914
915 /* restore guest XCR0 if XSAVE is enabled in the host */
916 if (rcr4() & CR4_XSAVE)
917 load_xcr(0, vcpu->guest_xcr0);
918
919 /*
920 * The FPU is now "dirty" with the guest's state so turn on emulation
921 * to trap any access to the FPU by the host.
922 */
923 fpu_start_emulating();
924 }
925
926 static void
927 save_guest_fpustate(struct vcpu *vcpu)
928 {
929
930 if ((rcr0() & CR0_TS) == 0)
931 panic("fpu emulation not enabled in host!");
932
933 /* save guest XCR0 and restore host XCR0 */
934 if (rcr4() & CR4_XSAVE) {
935 vcpu->guest_xcr0 = rxcr(0);
936 load_xcr(0, vmm_get_host_xcr0());
937 }
938
939 /* save guest FPU state */
940 fpu_stop_emulating();
941 fpusave(vcpu->guestfpu);
942 fpu_start_emulating();
943 }
944
945 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
946
947 static int
948 vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
949 bool from_idle)
950 {
951 int error;
952
953 vcpu_assert_locked(vcpu);
954
955 /*
956 * State transitions initiated by a vmmdev ioctl must always begin from
957 * the VCPU_IDLE state. This guarantees that there is only a single
958 * ioctl() operating on a vcpu at any point.
959 */
960 if (from_idle) {
961 while (vcpu->state != VCPU_IDLE)
962 msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
963 } else {
964 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
965 "vcpu idle state"));
966 }
967
968 if (vcpu->state == VCPU_RUNNING) {
969 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
970 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
971 } else {
972 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
973 "vcpu that is not running", vcpu->hostcpu));
974 }
975
976 /*
977 * The following state transitions are allowed:
978 * IDLE -> FROZEN -> IDLE
979 * FROZEN -> RUNNING -> FROZEN
980 * FROZEN -> SLEEPING -> FROZEN
981 */
982 switch (vcpu->state) {
983 case VCPU_IDLE:
984 case VCPU_RUNNING:
985 case VCPU_SLEEPING:
986 error = (newstate != VCPU_FROZEN);
987 break;
988 case VCPU_FROZEN:
989 error = (newstate == VCPU_FROZEN);
990 break;
991 default:
992 error = 1;
993 break;
994 }
995
996 if (error)
997 return (EBUSY);
998
999 vcpu->state = newstate;
1000 if (newstate == VCPU_RUNNING)
1001 vcpu->hostcpu = curcpu;
1002 else
1003 vcpu->hostcpu = NOCPU;
1004
1005 if (newstate == VCPU_IDLE)
1006 wakeup(&vcpu->state);
1007
1008 return (0);
1009 }
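/*
 * Illustrative use of the state machine (editorial sketch): an ioctl
 * needing exclusive access to a vcpu freezes it via the
 * IDLE -> FROZEN -> IDLE transition; 'from_idle' is true so the caller
 * sleeps until any other ioctl is done with the vcpu:
 *
 *	error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN, true);
 *	if (error == 0) {
 *		... operate on the frozen vcpu ...
 *		vcpu_set_state(vm, vcpuid, VCPU_IDLE, false);
 *	}
 */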
1010
1011 static void
1012 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1013 {
1014 int error;
1015
1016 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1017 panic("Error %d setting state to %d\n", error, newstate);
1018 }
1019
1020 static void
1021 vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
1022 {
1023 int error;
1024
1025 if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
1026 panic("Error %d setting state to %d", error, newstate);
1027 }
1028
1029 static void
1030 vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
1031 {
1032
1033 KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
1034
1035 /*
1036 * Update 'rendezvous_func' and execute a write memory barrier to
1037 * ensure that it is visible across all host cpus.  This is not needed
1038 * for correctness but it does ensure that all the vcpus notice the
1039 * requested rendezvous promptly.
1040 */
1041 vm->rendezvous_func = func;
1042 wmb();
1043 }
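/*
 * Editorial note: the wmb() above pairs with the unsynchronized reads
 * of 'rendezvous_func' in vm_handle_hlt() and in the VMRUN exit path
 * (via the 'rptr' argument passed down by vm_run()).  A vcpu that
 * misses the store simply observes it on a later check, which is why
 * the barrier affects only latency, not correctness.
 */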
1044
1045 #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \
1046 do { \
1047 if (vcpuid >= 0) \
1048 VCPU_CTR0(vm, vcpuid, fmt); \
1049 else \
1050 VM_CTR0(vm, fmt); \
1051 } while (0)
1052
1053 static void
1054 vm_handle_rendezvous(struct vm *vm, int vcpuid)
1055 {
1056
1057 KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1058 ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
1059
1060 mtx_lock(&vm->rendezvous_mtx);
1061 while (vm->rendezvous_func != NULL) {
1062 /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
1063 CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
1064
1065 if (vcpuid != -1 &&
1066 CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
1067 !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
1068 VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
1069 (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
1070 CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1071 }
1072 if (CPU_CMP(&vm->rendezvous_req_cpus,
1073 &vm->rendezvous_done_cpus) == 0) {
1074 VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
1075 vm_set_rendezvous_func(vm, NULL);
1076 wakeup(&vm->rendezvous_func);
1077 break;
1078 }
1079 RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
1080 mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1081 "vmrndv", 0);
1082 }
1083 mtx_unlock(&vm->rendezvous_mtx);
1084 }
1085
1086 /*
1087 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1088 */
1089 static int
1090 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1091 {
1092 struct vcpu *vcpu;
1093 const char *wmesg;
1094 int t, vcpu_halted, vm_halted;
1095
1096 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1097
1098 vcpu = &vm->vcpu[vcpuid];
1099 vcpu_halted = 0;
1100 vm_halted = 0;
1101
1102 vcpu_lock(vcpu);
1103 while (1) {
1104 /*
1105 * Do a final check for pending NMI or interrupts before
1106 * really putting this thread to sleep. Also check for
1107 * software events that would cause this vcpu to wakeup.
1108 *
1109 * These interrupts/events could have happened after the
1110 * vcpu returned from VMRUN() and before it acquired the
1111 * vcpu lock above.
1112 */
1113 if (vm->rendezvous_func != NULL || vm->suspend)
1114 break;
1115 if (vm_nmi_pending(vm, vcpuid))
1116 break;
1117 if (!intr_disabled) {
1118 if (vm_extint_pending(vm, vcpuid) ||
1119 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1120 break;
1121 }
1122 }
1123
1124 /* Don't go to sleep if the vcpu thread needs to yield */
1125 if (vcpu_should_yield(vm, vcpuid))
1126 break;
1127
1128 /*
1129 * Some Linux guests implement "halt" by having all vcpus
1130 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1131 * track of the vcpus that have entered this state. When all
1132 * vcpus enter the halted state the virtual machine is halted.
1133 */
1134 if (intr_disabled) {
1135 wmesg = "vmhalt";
1136 VCPU_CTR0(vm, vcpuid, "Halted");
1137 if (!vcpu_halted && halt_detection_enabled) {
1138 vcpu_halted = 1;
1139 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1140 }
1141 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1142 vm_halted = 1;
1143 break;
1144 }
1145 } else {
1146 wmesg = "vmidle";
1147 }
1148
1149 t = ticks;
1150 vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1151 /*
1152 * XXX msleep_spin() cannot be interrupted by signals so
1153 * wake up periodically to check pending signals.
1154 */
1155 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1156 vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1157 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1158 }
1159
1160 if (vcpu_halted)
1161 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1162
1163 vcpu_unlock(vcpu);
1164
1165 if (vm_halted)
1166 vm_suspend(vm, VM_SUSPEND_HALT);
1167
1168 return (0);
1169 }
1170
1171 static int
1172 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1173 {
1174 int rv, ftype;
1175 struct vm_map *map;
1176 struct vcpu *vcpu;
1177 struct vm_exit *vme;
1178
1179 vcpu = &vm->vcpu[vcpuid];
1180 vme = &vcpu->exitinfo;
1181
1182 ftype = vme->u.paging.fault_type;
1183 KASSERT(ftype == VM_PROT_READ ||
1184 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1185 ("vm_handle_paging: invalid fault_type %d", ftype));
1186
1187 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1188 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1189 vme->u.paging.gpa, ftype);
1190 if (rv == 0)
1191 goto done;
1192 }
1193
1194 map = &vm->vmspace->vm_map;
1195 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1196
1197 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1198 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1199
1200 if (rv != KERN_SUCCESS)
1201 return (EFAULT);
1202 done:
1203 /* restart execution at the faulting instruction */
1204 vme->inst_length = 0;
1205
1206 return (0);
1207 }
1208
1209 static int
1210 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1211 {
1212 struct vie *vie;
1213 struct vcpu *vcpu;
1214 struct vm_exit *vme;
1215 uint64_t gla, gpa;
1216 struct vm_guest_paging *paging;
1217 mem_region_read_t mread;
1218 mem_region_write_t mwrite;
1219 enum vm_cpu_mode cpu_mode;
1220 int cs_d, error;
1221
1222 vcpu = &vm->vcpu[vcpuid];
1223 vme = &vcpu->exitinfo;
1224
1225 gla = vme->u.inst_emul.gla;
1226 gpa = vme->u.inst_emul.gpa;
1227 cs_d = vme->u.inst_emul.cs_d;
1228 vie = &vme->u.inst_emul.vie;
1229 paging = &vme->u.inst_emul.paging;
1230 cpu_mode = paging->cpu_mode;
1231
1232 vie_init(vie);
1233
1234 /* Fetch, decode and emulate the faulting instruction */
1235 error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip,
1236 vme->inst_length, vie);
1237 if (error == 1)
1238 return (0); /* Resume guest to handle page fault */
1239 else if (error == -1)
1240 return (EFAULT);
1241 else if (error != 0)
1242 panic("%s: vmm_fetch_instruction error %d", __func__, error);
1243
1244 if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
1245 return (EFAULT);
1246
1247 /* return to userland unless this is an in-kernel emulated device */
1248 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1249 mread = lapic_mmio_read;
1250 mwrite = lapic_mmio_write;
1251 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1252 mread = vioapic_mmio_read;
1253 mwrite = vioapic_mmio_write;
1254 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1255 mread = vhpet_mmio_read;
1256 mwrite = vhpet_mmio_write;
1257 } else {
1258 *retu = true;
1259 return (0);
1260 }
1261
1262 error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
1263 mread, mwrite, retu);
1264
1265 return (error);
1266 }
1267
1268 static int
1269 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
1270 {
1271 int i, done;
1272 struct vcpu *vcpu;
1273
1274 done = 0;
1275 vcpu = &vm->vcpu[vcpuid];
1276
1277 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1278
1279 /*
1280 * Wait until all 'active_cpus' have suspended themselves.
1281 *
1282 * Since a VM may be suspended at any time including when one or
1283 * more vcpus are doing a rendezvous we need to call the rendezvous
1284 * handler while we are waiting to prevent a deadlock.
1285 */
1286 vcpu_lock(vcpu);
1287 while (1) {
1288 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1289 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1290 break;
1291 }
1292
1293 if (vm->rendezvous_func == NULL) {
1294 VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1295 vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
1296 msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1297 vcpu_require_state_locked(vcpu, VCPU_FROZEN);
1298 } else {
1299 VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1300 vcpu_unlock(vcpu);
1301 vm_handle_rendezvous(vm, vcpuid);
1302 vcpu_lock(vcpu);
1303 }
1304 }
1305 vcpu_unlock(vcpu);
1306
1307 /*
1308 * Wakeup the other sleeping vcpus and return to userspace.
1309 */
1310 for (i = 0; i < VM_MAXCPU; i++) {
1311 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1312 vcpu_notify_event(vm, i, false);
1313 }
1314 }
1315
1316 *retu = true;
1317 return (0);
1318 }
1319
1320 int
1321 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1322 {
1323 int i;
1324
1325 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1326 return (EINVAL);
1327
1328 if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1329 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1330 vm->suspend, how);
1331 return (EALREADY);
1332 }
1333
1334 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1335
1336 /*
1337 * Notify all active vcpus that they are now suspended.
1338 */
1339 for (i = 0; i < VM_MAXCPU; i++) {
1340 if (CPU_ISSET(i, &vm->active_cpus))
1341 vcpu_notify_event(vm, i, false);
1342 }
1343
1344 return (0);
1345 }
1346
1347 void
1348 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1349 {
1350 struct vm_exit *vmexit;
1351
1352 KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1353 ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1354
1355 vmexit = vm_exitinfo(vm, vcpuid);
1356 vmexit->rip = rip;
1357 vmexit->inst_length = 0;
1358 vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1359 vmexit->u.suspended.how = vm->suspend;
1360 }
1361
1362 void
1363 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
1364 {
1365 struct vm_exit *vmexit;
1366
1367 KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
1368
1369 vmexit = vm_exitinfo(vm, vcpuid);
1370 vmexit->rip = rip;
1371 vmexit->inst_length = 0;
1372 vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1373 vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
1374 }
1375
1376 void
1377 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1378 {
1379 struct vm_exit *vmexit;
1380
1381 vmexit = vm_exitinfo(vm, vcpuid);
1382 vmexit->rip = rip;
1383 vmexit->inst_length = 0;
1384 vmexit->exitcode = VM_EXITCODE_BOGUS;
1385 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1386 }
1387
1388 int
1389 vm_run(struct vm *vm, struct vm_run *vmrun)
1390 {
1391 int error, vcpuid;
1392 struct vcpu *vcpu;
1393 struct pcb *pcb;
1394 uint64_t tscval, rip;
1395 struct vm_exit *vme;
1396 bool retu, intr_disabled;
1397 pmap_t pmap;
1398 void *rptr, *sptr;
1399
1400 vcpuid = vmrun->cpuid;
1401
1402 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1403 return (EINVAL);
1404
1405 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1406 return (EINVAL);
1407
1408 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1409 return (EINVAL);
1410
1411 rptr = &vm->rendezvous_func;
1412 sptr = &vm->suspend;
1413 pmap = vmspace_pmap(vm->vmspace);
1414 vcpu = &vm->vcpu[vcpuid];
1415 vme = &vcpu->exitinfo;
1416 rip = vmrun->rip;
1417 restart:
1418 critical_enter();
1419
1420 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1421 ("vm_run: absurd pm_active"));
1422
1423 tscval = rdtsc();
1424
1425 pcb = PCPU_GET(curpcb);
1426 set_pcb_flags(pcb, PCB_FULL_IRET);
1427
1428 restore_guest_msrs(vm, vcpuid);
1429 restore_guest_fpustate(vcpu);
1430
1431 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1432 error = VMRUN(vm->cookie, vcpuid, rip, pmap, rptr, sptr);
1433 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1434
1435 save_guest_fpustate(vcpu);
1436 restore_host_msrs(vm, vcpuid);
1437
1438 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1439
1440 critical_exit();
1441
1442 if (error == 0) {
1443 retu = false;
1444 switch (vme->exitcode) {
1445 case VM_EXITCODE_SUSPENDED:
1446 error = vm_handle_suspend(vm, vcpuid, &retu);
1447 break;
1448 case VM_EXITCODE_IOAPIC_EOI:
1449 vioapic_process_eoi(vm, vcpuid,
1450 vme->u.ioapic_eoi.vector);
1451 break;
1452 case VM_EXITCODE_RENDEZVOUS:
1453 vm_handle_rendezvous(vm, vcpuid);
1454 error = 0;
1455 break;
1456 case VM_EXITCODE_HLT:
1457 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1458 error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1459 break;
1460 case VM_EXITCODE_PAGING:
1461 error = vm_handle_paging(vm, vcpuid, &retu);
1462 break;
1463 case VM_EXITCODE_INST_EMUL:
1464 error = vm_handle_inst_emul(vm, vcpuid, &retu);
1465 break;
1466 case VM_EXITCODE_INOUT:
1467 case VM_EXITCODE_INOUT_STR:
1468 error = vm_handle_inout(vm, vcpuid, vme, &retu);
1469 break;
1470 default:
1471 retu = true; /* handled in userland */
1472 break;
1473 }
1474 }
1475
1476 if (error == 0 && retu == false) {
1477 rip = vme->rip + vme->inst_length;
1478 goto restart;
1479 }
1480
1481 /* copy the exit information */
1482 bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1483 return (error);
1484 }
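/*
 * Illustrative driver loop (editorial sketch, with hypothetical
 * helpers handle_exit() and next_rip()): the caller re-enters vm_run()
 * until an exit that it must service itself:
 *
 *	struct vm_run vmrun = { .cpuid = 0, .rip = entry_rip };
 *	for (;;) {
 *		error = vm_run(vm, &vmrun);
 *		if (error != 0 || handle_exit(&vmrun.vm_exit) != 0)
 *			break;
 *		vmrun.rip = next_rip(&vmrun.vm_exit);
 *	}
 */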
1485
1486 int
1487 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
1488 {
1489 struct vcpu *vcpu;
1490 int type, vector;
1491
1492 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1493 return (EINVAL);
1494
1495 vcpu = &vm->vcpu[vcpuid];
1496
1497 if (info & VM_INTINFO_VALID) {
1498 type = info & VM_INTINFO_TYPE;
1499 vector = info & 0xff;
1500 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
1501 return (EINVAL);
1502 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
1503 return (EINVAL);
1504 if (info & VM_INTINFO_RSVD)
1505 return (EINVAL);
1506 } else {
1507 info = 0;
1508 }
1509 VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
1510 vcpu->exitintinfo = info;
1511 return (0);
1512 }
1513
1514 enum exc_class {
1515 EXC_BENIGN,
1516 EXC_CONTRIBUTORY,
1517 EXC_PAGEFAULT
1518 };
1519
1520 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
1521
1522 static enum exc_class
1523 exception_class(uint64_t info)
1524 {
1525 int type, vector;
1526
1527 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
1528 type = info & VM_INTINFO_TYPE;
1529 vector = info & 0xff;
1530
1531 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
1532 switch (type) {
1533 case VM_INTINFO_HWINTR:
1534 case VM_INTINFO_SWINTR:
1535 case VM_INTINFO_NMI:
1536 return (EXC_BENIGN);
1537 default:
1538 /*
1539 * Hardware exception.
1540 *
1541 * SVM and VT-x use identical type values to represent NMI,
1542 * hardware interrupt and software interrupt.
1543 *
1544 * SVM uses type '3' for all exceptions. VT-x uses type '3'
1545 * for exceptions except #BP and #OF. #BP and #OF use a type
1546 * value of '5' or '6'. Therefore we don't check for explicit
1547 * values of 'type' to classify 'intinfo' into a hardware
1548 * exception.
1549 */
1550 break;
1551 }
1552
1553 switch (vector) {
1554 case IDT_PF:
1555 case IDT_VE:
1556 return (EXC_PAGEFAULT);
1557 case IDT_DE:
1558 case IDT_TS:
1559 case IDT_NP:
1560 case IDT_SS:
1561 case IDT_GP:
1562 return (EXC_CONTRIBUTORY);
1563 default:
1564 return (EXC_BENIGN);
1565 }
1566 }
1567
1568 static int
1569 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
1570 uint64_t *retinfo)
1571 {
1572 enum exc_class exc1, exc2;
1573 int type1, vector1;
1574
1575 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
1576 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
1577
1578 /*
1579 * If an exception occurs while attempting to call the double-fault
1580 * handler the processor enters shutdown mode (aka triple fault).
1581 */
1582 type1 = info1 & VM_INTINFO_TYPE;
1583 vector1 = info1 & 0xff;
1584 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
1585 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
1586 info1, info2);
1587 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
1588 *retinfo = 0;
1589 return (0);
1590 }
1591
1592 /*
1593 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
1594 */
1595 exc1 = exception_class(info1);
1596 exc2 = exception_class(info2);
1597 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
1598 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
1599 /* Convert nested fault into a double fault. */
1600 *retinfo = IDT_DF;
1601 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1602 *retinfo |= VM_INTINFO_DEL_ERRCODE;
1603 } else {
1604 /* Handle exceptions serially */
1605 *retinfo = info2;
1606 }
1607 return (1);
1608 }
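/*
 * Worked examples (editorial, per the SDM classes encoded above):
 * - info1 = #PF, info2 = #GP: a pagefault followed by a contributory
 *   exception, so the pair collapses into a #DF with error code 0.
 * - info1 = #GP, info2 = #PF: contributory followed by a pagefault is
 *   NOT a double-fault condition; info2 is delivered serially.
 * - info1 = #DF, info2 = anything: the guest has triple faulted and
 *   the VM is suspended with VM_SUSPEND_TRIPLEFAULT.
 */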
1609
1610 static uint64_t
1611 vcpu_exception_intinfo(struct vcpu *vcpu)
1612 {
1613 uint64_t info = 0;
1614
1615 if (vcpu->exception_pending) {
1616 info = vcpu->exception.vector & 0xff;
1617 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1618 if (vcpu->exception.error_code_valid) {
1619 info |= VM_INTINFO_DEL_ERRCODE;
1620 info |= (uint64_t)vcpu->exception.error_code << 32;
1621 }
1622 }
1623 return (info);
1624 }
1625
1626 int
1627 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
1628 {
1629 struct vcpu *vcpu;
1630 uint64_t info1, info2;
1631 int valid;
1632
1633 KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
1634
1635 vcpu = &vm->vcpu[vcpuid];
1636
1637 info1 = vcpu->exitintinfo;
1638 vcpu->exitintinfo = 0;
1639
1640 info2 = 0;
1641 if (vcpu->exception_pending) {
1642 info2 = vcpu_exception_intinfo(vcpu);
1643 vcpu->exception_pending = 0;
1644 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
1645 vcpu->exception.vector, info2);
1646 }
1647
1648 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
1649 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
1650 } else if (info1 & VM_INTINFO_VALID) {
1651 *retinfo = info1;
1652 valid = 1;
1653 } else if (info2 & VM_INTINFO_VALID) {
1654 *retinfo = info2;
1655 valid = 1;
1656 } else {
1657 valid = 0;
1658 }
1659
1660 if (valid) {
1661 VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
1662 "retinfo(%#lx)", __func__, info1, info2, *retinfo);
1663 }
1664
1665 return (valid);
1666 }
1667
1668 int
1669 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
1670 {
1671 struct vcpu *vcpu;
1672
1673 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1674 return (EINVAL);
1675
1676 vcpu = &vm->vcpu[vcpuid];
1677 *info1 = vcpu->exitintinfo;
1678 *info2 = vcpu_exception_intinfo(vcpu);
1679 return (0);
1680 }
1681
1682 int
1683 vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
1684 {
1685 struct vcpu *vcpu;
1686
1687 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1688 return (EINVAL);
1689
1690 if (exception->vector < 0 || exception->vector >= 32)
1691 return (EINVAL);
1692
1693 /*
1694 * A double fault exception should never be injected directly into
1695 * the guest. It is a derived exception that results from specific
1696 * combinations of nested faults.
1697 */
1698 if (exception->vector == IDT_DF)
1699 return (EINVAL);
1700
1701 vcpu = &vm->vcpu[vcpuid];
1702
1703 if (vcpu->exception_pending) {
1704 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
1705 "pending exception %d", exception->vector,
1706 vcpu->exception.vector);
1707 return (EBUSY);
1708 }
1709
1710 vcpu->exception_pending = 1;
1711 vcpu->exception = *exception;
1712 VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
1713 return (0);
1714 }
1715
1716 void
1717 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
1718 int errcode)
1719 {
1720 struct vm_exception exception;
1721 struct vm_exit *vmexit;
1722 struct vm *vm;
1723 int error;
1724
1725 vm = vmarg;
1726
1727 exception.vector = vector;
1728 exception.error_code = errcode;
1729 exception.error_code_valid = errcode_valid;
1730 error = vm_inject_exception(vm, vcpuid, &exception);
1731 KASSERT(error == 0, ("vm_inject_exception error %d", error));
1732
1733 /*
1734 * A fault-like exception allows the instruction to be restarted
1735 * after the exception handler returns.
1736 *
1737 * By setting the inst_length to 0 we ensure that the instruction
1738 * pointer remains at the faulting instruction.
1739 */
1740 vmexit = vm_exitinfo(vm, vcpuid);
1741 vmexit->inst_length = 0;
1742 }
1743
1744 void
1745 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
1746 {
1747 struct vm *vm;
1748 int error;
1749
1750 vm = vmarg;
1751 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
1752 error_code, cr2);
1753
1754 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
1755 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
1756
1757 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
1758 }
1759
1760 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
1761
1762 int
1763 vm_inject_nmi(struct vm *vm, int vcpuid)
1764 {
1765 struct vcpu *vcpu;
1766
1767 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1768 return (EINVAL);
1769
1770 vcpu = &vm->vcpu[vcpuid];
1771
1772 vcpu->nmi_pending = 1;
1773 vcpu_notify_event(vm, vcpuid, false);
1774 return (0);
1775 }
1776
1777 int
1778 vm_nmi_pending(struct vm *vm, int vcpuid)
1779 {
1780 struct vcpu *vcpu;
1781
1782 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1783 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1784
1785 vcpu = &vm->vcpu[vcpuid];
1786
1787 return (vcpu->nmi_pending);
1788 }
1789
1790 void
1791 vm_nmi_clear(struct vm *vm, int vcpuid)
1792 {
1793 struct vcpu *vcpu;
1794
1795 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1796 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
1797
1798 vcpu = &vm->vcpu[vcpuid];
1799
1800 if (vcpu->nmi_pending == 0)
1801 panic("vm_nmi_clear: inconsistent nmi_pending state");
1802
1803 vcpu->nmi_pending = 0;
1804 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
1805 }
1806
1807 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
1808
1809 int
1810 vm_inject_extint(struct vm *vm, int vcpuid)
1811 {
1812 struct vcpu *vcpu;
1813
1814 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1815 return (EINVAL);
1816
1817 vcpu = &vm->vcpu[vcpuid];
1818
1819 vcpu->extint_pending = 1;
1820 vcpu_notify_event(vm, vcpuid, false);
1821 return (0);
1822 }
1823
1824 int
1825 vm_extint_pending(struct vm *vm, int vcpuid)
1826 {
1827 struct vcpu *vcpu;
1828
1829 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1830 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1831
1832 vcpu = &vm->vcpu[vcpuid];
1833
1834 return (vcpu->extint_pending);
1835 }
1836
1837 void
1838 vm_extint_clear(struct vm *vm, int vcpuid)
1839 {
1840 struct vcpu *vcpu;
1841
1842 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1843 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
1844
1845 vcpu = &vm->vcpu[vcpuid];
1846
1847 if (vcpu->extint_pending == 0)
1848 panic("vm_extint_clear: inconsistent extint_pending state");
1849
1850 vcpu->extint_pending = 0;
1851 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
1852 }
1853
1854 int
1855 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
1856 {
1857 if (vcpu < 0 || vcpu >= VM_MAXCPU)
1858 return (EINVAL);
1859
1860 if (type < 0 || type >= VM_CAP_MAX)
1861 return (EINVAL);
1862
1863 return (VMGETCAP(vm->cookie, vcpu, type, retval));
1864 }
1865
1866 int
1867 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
1868 {
1869 if (vcpu < 0 || vcpu >= VM_MAXCPU)
1870 return (EINVAL);
1871
1872 if (type < 0 || type >= VM_CAP_MAX)
1873 return (EINVAL);
1874
1875 return (VMSETCAP(vm->cookie, vcpu, type, val));
1876 }
1877
1878 uint64_t *
1879 vm_guest_msrs(struct vm *vm, int cpu)
1880 {
1881 return (vm->vcpu[cpu].guest_msrs);
1882 }
1883
1884 struct vlapic *
1885 vm_lapic(struct vm *vm, int cpu)
1886 {
1887 return (vm->vcpu[cpu].vlapic);
1888 }
1889
1890 struct vioapic *
1891 vm_ioapic(struct vm *vm)
1892 {
1893
1894 return (vm->vioapic);
1895 }
1896
1897 struct vhpet *
1898 vm_hpet(struct vm *vm)
1899 {
1900
1901 return (vm->vhpet);
1902 }
1903
1904 boolean_t
1905 vmm_is_pptdev(int bus, int slot, int func)
1906 {
1907 int found, i, n;
1908 int b, s, f;
1909 char *val, *cp, *cp2;
1910
1911 /*
1912 * XXX
1913 * The length of an environment variable is limited to 128 bytes, which
1914 * puts an upper limit on the number of passthru devices that may be
1915 * specified using a single environment variable.
1916 *
1917 * Work around this by scanning multiple environment variable
1918 * names instead of a single one - yuck!
1919 */
1920 const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
1921
1922 /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
1923 found = 0;
1924 for (i = 0; names[i] != NULL && !found; i++) {
1925 cp = val = getenv(names[i]);
1926 while (cp != NULL && *cp != '\0') {
1927 if ((cp2 = strchr(cp, ' ')) != NULL)
1928 *cp2 = '\0';
1929
1930 n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
1931 if (n == 3 && bus == b && slot == s && func == f) {
1932 found = 1;
1933 break;
1934 }
1935
1936 if (cp2 != NULL)
1937 *cp2++ = ' ';
1938
1939 cp = cp2;
1940 }
1941 freeenv(val);
1942 }
1943 return (found);
1944 }
1945
1946 void *
1947 vm_iommu_domain(struct vm *vm)
1948 {
1949
1950 return (vm->iommu);
1951 }
1952
1953 int
1954 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1955 bool from_idle)
1956 {
1957 int error;
1958 struct vcpu *vcpu;
1959
1960 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1961 panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
1962
1963 vcpu = &vm->vcpu[vcpuid];
1964
1965 vcpu_lock(vcpu);
1966 error = vcpu_set_state_locked(vcpu, newstate, from_idle);
1967 vcpu_unlock(vcpu);
1968
1969 return (error);
1970 }
1971
1972 enum vcpu_state
1973 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
1974 {
1975 struct vcpu *vcpu;
1976 enum vcpu_state state;
1977
1978 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1979 panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
1980
1981 vcpu = &vm->vcpu[vcpuid];
1982
1983 vcpu_lock(vcpu);
1984 state = vcpu->state;
1985 if (hostcpu != NULL)
1986 *hostcpu = vcpu->hostcpu;
1987 vcpu_unlock(vcpu);
1988
1989 return (state);
1990 }
1991
1992 int
1993 vm_activate_cpu(struct vm *vm, int vcpuid)
1994 {
1995
1996 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1997 return (EINVAL);
1998
1999 if (CPU_ISSET(vcpuid, &vm->active_cpus))
2000 return (EBUSY);
2001
2002 VCPU_CTR0(vm, vcpuid, "activated");
2003 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
2004 return (0);
2005 }
2006
2007 cpuset_t
2008 vm_active_cpus(struct vm *vm)
2009 {
2010
2011 return (vm->active_cpus);
2012 }
2013
2014 cpuset_t
2015 vm_suspended_cpus(struct vm *vm)
2016 {
2017
2018 return (vm->suspended_cpus);
2019 }
2020
2021 void *
2022 vcpu_stats(struct vm *vm, int vcpuid)
2023 {
2024
2025 return (vm->vcpu[vcpuid].stats);
2026 }
2027
2028 int
2029 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
2030 {
2031 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2032 return (EINVAL);
2033
2034 *state = vm->vcpu[vcpuid].x2apic_state;
2035
2036 return (0);
2037 }
2038
2039 int
2040 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
2041 {
2042 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2043 return (EINVAL);
2044
2045 if (state >= X2APIC_STATE_LAST)
2046 return (EINVAL);
2047
2048 vm->vcpu[vcpuid].x2apic_state = state;
2049
2050 vlapic_set_x2apic_state(vm, vcpuid, state);
2051
2052 return (0);
2053 }
2054
2055 /*
2056 * This function is called to ensure that a vcpu "sees" a pending event
2057 * as soon as possible:
2058 * - If the vcpu thread is sleeping then it is woken up.
2059 * - If the vcpu is running on a different host_cpu then an IPI will be directed
2060 * to the host_cpu to cause the vcpu to trap into the hypervisor.
2061 */
2062 void
2063 vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
2064 {
2065 int hostcpu;
2066 struct vcpu *vcpu;
2067
2068 vcpu = &vm->vcpu[vcpuid];
2069
2070 vcpu_lock(vcpu);
2071 hostcpu = vcpu->hostcpu;
2072 if (vcpu->state == VCPU_RUNNING) {
2073 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
2074 if (hostcpu != curcpu) {
2075 if (lapic_intr) {
2076 vlapic_post_intr(vcpu->vlapic, hostcpu,
2077 vmm_ipinum);
2078 } else {
2079 ipi_cpu(hostcpu, vmm_ipinum);
2080 }
2081 } else {
2082 /*
2083 * If the 'vcpu' is running on 'curcpu' then it must
2084 * be sending a notification to itself (e.g. SELF_IPI).
2085 * The pending event will be picked up when the vcpu
2086 * transitions back to guest context.
2087 */
2088 }
2089 } else {
2090 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
2091 "with hostcpu %d", vcpu->state, hostcpu));
2092 if (vcpu->state == VCPU_SLEEPING)
2093 wakeup_one(vcpu);
2094 }
2095 vcpu_unlock(vcpu);
2096 }
2097
2098 struct vmspace *
2099 vm_get_vmspace(struct vm *vm)
2100 {
2101
2102 return (vm->vmspace);
2103 }
2104
2105 int
2106 vm_apicid2vcpuid(struct vm *vm, int apicid)
2107 {
2108 /*
2109 * XXX apic id is assumed to be numerically identical to vcpu id
2110 */
2111 return (apicid);
2112 }
2113
2114 void
2115 vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
2116 vm_rendezvous_func_t func, void *arg)
2117 {
2118 int i;
2119
2120 /*
2121 * Enforce that this function is called without any locks
2122 */
2123 WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
2124 KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
2125 ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
2126
2127 restart:
2128 mtx_lock(&vm->rendezvous_mtx);
2129 if (vm->rendezvous_func != NULL) {
2130 /*
2131 * If a rendezvous is already in progress then we need to
2132 * call the rendezvous handler in case this 'vcpuid' is one
2133 * of the targets of the rendezvous.
2134 */
2135 RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
2136 mtx_unlock(&vm->rendezvous_mtx);
2137 vm_handle_rendezvous(vm, vcpuid);
2138 goto restart;
2139 }
2140 KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
2141 "rendezvous is still in progress"));
2142
2143 RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
2144 vm->rendezvous_req_cpus = dest;
2145 CPU_ZERO(&vm->rendezvous_done_cpus);
2146 vm->rendezvous_arg = arg;
2147 vm_set_rendezvous_func(vm, func);
2148 mtx_unlock(&vm->rendezvous_mtx);
2149
2150 /*
2151 * Wake up any sleeping vcpus and trigger a VM-exit in any running
2152 * vcpus so they handle the rendezvous as soon as possible.
2153 */
2154 for (i = 0; i < VM_MAXCPU; i++) {
2155 if (CPU_ISSET(i, &dest))
2156 vcpu_notify_event(vm, i, false);
2157 }
2158
2159 vm_handle_rendezvous(vm, vcpuid);
2160 }
2161
2162 struct vatpic *
2163 vm_atpic(struct vm *vm)
2164 {
2165 return (vm->vatpic);
2166 }
2167
2168 struct vatpit *
2169 vm_atpit(struct vm *vm)
2170 {
2171 return (vm->vatpit);
2172 }
2173
2174 enum vm_reg_name
2175 vm_segment_name(int seg)
2176 {
2177 static enum vm_reg_name seg_names[] = {
2178 VM_REG_GUEST_ES,
2179 VM_REG_GUEST_CS,
2180 VM_REG_GUEST_SS,
2181 VM_REG_GUEST_DS,
2182 VM_REG_GUEST_FS,
2183 VM_REG_GUEST_GS
2184 };
2185
2186 KASSERT(seg >= 0 && seg < nitems(seg_names),
2187 ("%s: invalid segment encoding %d", __func__, seg));
2188 return (seg_names[seg]);
2189 }
2190
2191 void
2192 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
2193 int num_copyinfo)
2194 {
2195 int idx;
2196
2197 for (idx = 0; idx < num_copyinfo; idx++) {
2198 if (copyinfo[idx].cookie != NULL)
2199 vm_gpa_release(copyinfo[idx].cookie);
2200 }
2201 bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
2202 }
2203
2204 int
2205 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2206 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
2207 int num_copyinfo)
2208 {
2209 int error, idx, nused;
2210 size_t n, off, remaining;
2211 void *hva, *cookie;
2212 uint64_t gpa;
2213
2214 bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
2215
2216 nused = 0;
2217 remaining = len;
2218 while (remaining > 0) {
2219 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
2220 error = vmm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
2221 if (error)
2222 return (error);
2223 off = gpa & PAGE_MASK;
2224 n = min(remaining, PAGE_SIZE - off);
2225 copyinfo[nused].gpa = gpa;
2226 copyinfo[nused].len = n;
2227 remaining -= n;
2228 gla += n;
2229 nused++;
2230 }
2231
2232 for (idx = 0; idx < nused; idx++) {
2233 hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
2234 prot, &cookie);
2235 if (hva == NULL)
2236 break;
2237 copyinfo[idx].hva = hva;
2238 copyinfo[idx].cookie = cookie;
2239 }
2240
2241 if (idx != nused) {
2242 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
2243 return (-1);
2244 } else {
2245 return (0);
2246 }
2247 }
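/*
 * Illustrative sequence (editorial sketch): instruction emulation maps
 * a guest-linear buffer once and then copies through it.  Two
 * vm_copyinfo entries cover a buffer spanning at most two pages:
 *
 *	struct vm_copyinfo copyinfo[2];
 *	error = vm_copy_setup(vm, vcpuid, paging, gla, len, VM_PROT_READ,
 *	    copyinfo, nitems(copyinfo));
 *	if (error == 0) {
 *		vm_copyin(vm, vcpuid, copyinfo, buf, len);
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 */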
2248
2249 void
2250 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
2251 size_t len)
2252 {
2253 char *dst;
2254 int idx;
2255
2256 dst = kaddr;
2257 idx = 0;
2258 while (len > 0) {
2259 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
2260 len -= copyinfo[idx].len;
2261 dst += copyinfo[idx].len;
2262 idx++;
2263 }
2264 }
2265
2266 void
2267 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
2268 struct vm_copyinfo *copyinfo, size_t len)
2269 {
2270 const char *src;
2271 int idx;
2272
2273 src = kaddr;
2274 idx = 0;
2275 while (len > 0) {
2276 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
2277 len -= copyinfo[idx].len;
2278 src += copyinfo[idx].len;
2279 idx++;
2280 }
2281 }
2282
2283 /*
2284 * Return the amount of in-use and wired memory for the VM. Since
2285 * these are global stats, only return the values for vCPU 0.
2286 */
2287 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
2288 VMM_STAT_DECLARE(VMM_MEM_WIRED);
2289
2290 static void
2291 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2292 {
2293
2294 if (vcpu == 0) {
2295 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
2296 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
2297 }
2298 }
2299
2300 static void
2301 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2302 {
2303
2304 if (vcpu == 0) {
2305 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
2306 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
2307 }
2308 }
2309
2310 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
2311 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);