sys/amd64/vmm/vmm.c
1 /*-
2 * Copyright (c) 2011 NetApp, Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: releng/11.2/sys/amd64/vmm/vmm.c 331722 2018-03-29 02:50:57Z eadler $
27 */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD: releng/11.2/sys/amd64/vmm/vmm.c 331722 2018-03-29 02:50:57Z eadler $");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/module.h>
36 #include <sys/sysctl.h>
37 #include <sys/malloc.h>
38 #include <sys/pcpu.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/rwlock.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
46
47 #include <vm/vm.h>
48 #include <vm/vm_object.h>
49 #include <vm/vm_page.h>
50 #include <vm/pmap.h>
51 #include <vm/vm_map.h>
52 #include <vm/vm_extern.h>
53 #include <vm/vm_param.h>
54
55 #include <machine/cpu.h>
56 #include <machine/pcb.h>
57 #include <machine/smp.h>
58 #include <machine/md_var.h>
59 #include <x86/psl.h>
60 #include <x86/apicreg.h>
61
62 #include <machine/vmm.h>
63 #include <machine/vmm_dev.h>
64 #include <machine/vmm_instruction_emul.h>
65
66 #include "vmm_ioport.h"
67 #include "vmm_ktr.h"
68 #include "vmm_host.h"
69 #include "vmm_mem.h"
70 #include "vmm_util.h"
71 #include "vatpic.h"
72 #include "vatpit.h"
73 #include "vhpet.h"
74 #include "vioapic.h"
75 #include "vlapic.h"
76 #include "vpmtmr.h"
77 #include "vrtc.h"
78 #include "vmm_stat.h"
79 #include "vmm_lapic.h"
80
81 #include "io/ppt.h"
82 #include "io/iommu.h"
83
84 struct vlapic;
85
86 /*
87 * Initialization:
88 * (a) allocated when vcpu is created
89 * (i) initialized when vcpu is created and when it is reinitialized
90 * (o) initialized the first time the vcpu is created
91 * (x) initialized before use
92 */
93 struct vcpu {
94 struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */
95 enum vcpu_state state; /* (o) vcpu state */
96 int hostcpu; /* (o) vcpu's host cpu */
97 int reqidle; /* (i) request vcpu to idle */
98 struct vlapic *vlapic; /* (i) APIC device model */
99 enum x2apic_state x2apic_state; /* (i) APIC mode */
100 uint64_t exitintinfo; /* (i) events pending at VM exit */
101 int nmi_pending; /* (i) NMI pending */
102 int extint_pending; /* (i) INTR pending */
103 int exception_pending; /* (i) exception pending */
104 int exc_vector; /* (x) exception collateral */
105 int exc_errcode_valid;
106 uint32_t exc_errcode;
107 struct savefpu *guestfpu; /* (a,i) guest fpu state */
108 uint64_t guest_xcr0; /* (i) guest %xcr0 register */
109 void *stats; /* (a,i) statistics */
110 struct vm_exit exitinfo; /* (x) exit reason and collateral */
111 uint64_t nextrip; /* (x) next instruction to execute */
112 };
113
114 #define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
115 #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
116 #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
117 #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
118 #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
119
120 struct mem_seg {
121 size_t len;
122 bool sysmem;
123 struct vm_object *object;
124 };
125 #define VM_MAX_MEMSEGS 3
126
127 struct mem_map {
128 vm_paddr_t gpa;
129 size_t len;
130 vm_ooffset_t segoff;
131 int segid;
132 int prot;
133 int flags;
134 };
135 #define VM_MAX_MEMMAPS 4
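
/*
 * Note: a 'mem_seg' describes a backing VM object (the memory itself)
 * while a 'mem_map' binds a page-aligned slice of a segment into the
 * guest physical address space; several maps may reference a single
 * segment. See vm_alloc_memseg() and vm_mmap_memseg() below.
 */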
136
137 /*
138 * Initialization:
139 * (o) initialized the first time the VM is created
140 * (i) initialized when VM is created and when it is reinitialized
141 * (x) initialized before use
142 */
143 struct vm {
144 void *cookie; /* (i) cpu-specific data */
145 void *iommu; /* (x) iommu-specific data */
146 struct vhpet *vhpet; /* (i) virtual HPET */
147 struct vioapic *vioapic; /* (i) virtual ioapic */
148 struct vatpic *vatpic; /* (i) virtual atpic */
149 struct vatpit *vatpit; /* (i) virtual atpit */
150 struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */
151 struct vrtc *vrtc; /* (o) virtual RTC */
152 volatile cpuset_t active_cpus; /* (i) active vcpus */
153 int suspend; /* (i) stop VM execution */
154 volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
155 volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
156 cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */
157 cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */
158 void *rendezvous_arg; /* (x) rendezvous func/arg */
159 vm_rendezvous_func_t rendezvous_func;
160 struct mtx rendezvous_mtx; /* (o) rendezvous lock */
161 struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
162 struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
163 struct vmspace *vmspace; /* (o) guest's address space */
164 char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */
165 struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */
166 };
167
168 static int vmm_initialized;
169
170 static struct vmm_ops *ops;
171 #define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0)
172 #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
173 #define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0)
174
175 #define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
176 #define VMRUN(vmi, vcpu, rip, pmap, evinfo) \
177 (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
178 #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
179 #define VMSPACE_ALLOC(min, max) \
180 (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
181 #define VMSPACE_FREE(vmspace) \
182 (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
183 #define VMGETREG(vmi, vcpu, num, retval) \
184 (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
185 #define VMSETREG(vmi, vcpu, num, val) \
186 (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
187 #define VMGETDESC(vmi, vcpu, num, desc) \
188 (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
189 #define VMSETDESC(vmi, vcpu, num, desc) \
190 (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
191 #define VMGETCAP(vmi, vcpu, num, retval) \
192 (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
193 #define VMSETCAP(vmi, vcpu, num, val) \
194 (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
195 #define VLAPIC_INIT(vmi, vcpu) \
196 (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
197 #define VLAPIC_CLEANUP(vmi, vlapic) \
198 (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
199
200 #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
201 #define fpu_stop_emulating() clts()
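
/*
 * Note: setting CR0.TS makes the next FPU/SIMD instruction raise #NM,
 * so "emulation" here is really a trap that catches any host use of
 * the FPU while it still holds guest state (see
 * restore_guest_fpustate()). Clearing CR0.TS with clts() re-enables
 * direct FPU access.
 */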
202
203 static MALLOC_DEFINE(M_VM, "vm", "vm");
204
205 /* statistics */
206 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
207
208 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
209
210 /*
211 * Halt the guest if all vcpus are executing a HLT instruction with
212 * interrupts disabled.
213 */
214 static int halt_detection_enabled = 1;
215 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
216 &halt_detection_enabled, 0,
217 "Halt VM if all vcpus execute HLT with interrupts disabled");
218
219 static int vmm_ipinum;
220 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
221 "IPI vector used for vcpu notifications");
222
223 static int trace_guest_exceptions;
224 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
225 &trace_guest_exceptions, 0,
226 "Trap into hypervisor on all guest exceptions and reflect them back");
227
228 static void vm_free_memmap(struct vm *vm, int ident);
229 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
230 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
231
232 #ifdef KTR
233 static const char *
234 vcpu_state2str(enum vcpu_state state)
235 {
236
237 switch (state) {
238 case VCPU_IDLE:
239 return ("idle");
240 case VCPU_FROZEN:
241 return ("frozen");
242 case VCPU_RUNNING:
243 return ("running");
244 case VCPU_SLEEPING:
245 return ("sleeping");
246 default:
247 return ("unknown");
248 }
249 }
250 #endif
251
252 static void
253 vcpu_cleanup(struct vm *vm, int i, bool destroy)
254 {
255 struct vcpu *vcpu = &vm->vcpu[i];
256
257 VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
258 if (destroy) {
259 vmm_stat_free(vcpu->stats);
260 fpu_save_area_free(vcpu->guestfpu);
261 }
262 }
263
264 static void
265 vcpu_init(struct vm *vm, int vcpu_id, bool create)
266 {
267 struct vcpu *vcpu;
268
269 KASSERT(vcpu_id >= 0 && vcpu_id < VM_MAXCPU,
270 ("vcpu_init: invalid vcpu %d", vcpu_id));
271
272 vcpu = &vm->vcpu[vcpu_id];
273
274 if (create) {
275 KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
276 "initialized", vcpu_id));
277 vcpu_lock_init(vcpu);
278 vcpu->state = VCPU_IDLE;
279 vcpu->hostcpu = NOCPU;
280 vcpu->guestfpu = fpu_save_area_alloc();
281 vcpu->stats = vmm_stat_alloc();
282 }
283
284 vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
285 vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
286 vcpu->reqidle = 0;
287 vcpu->exitintinfo = 0;
288 vcpu->nmi_pending = 0;
289 vcpu->extint_pending = 0;
290 vcpu->exception_pending = 0;
291 vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
292 fpu_save_area_reset(vcpu->guestfpu);
293 vmm_stat_init(vcpu->stats);
294 }
295
296 int
297 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
298 {
299
300 return (trace_guest_exceptions);
301 }
302
303 struct vm_exit *
304 vm_exitinfo(struct vm *vm, int cpuid)
305 {
306 struct vcpu *vcpu;
307
308 if (cpuid < 0 || cpuid >= VM_MAXCPU)
309 panic("vm_exitinfo: invalid cpuid %d", cpuid);
310
311 vcpu = &vm->vcpu[cpuid];
312
313 return (&vcpu->exitinfo);
314 }
315
316 static void
317 vmm_resume(void)
318 {
319 VMM_RESUME();
320 }
321
322 static int
323 vmm_init(void)
324 {
325 int error;
326
327 vmm_host_state_init();
328
329 vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
330 &IDTVEC(justreturn));
331 if (vmm_ipinum < 0)
332 vmm_ipinum = IPI_AST;
333
334 error = vmm_mem_init();
335 if (error)
336 return (error);
337
338 if (vmm_is_intel())
339 ops = &vmm_ops_intel;
340 else if (vmm_is_amd())
341 ops = &vmm_ops_amd;
342 else
343 return (ENXIO);
344
345 vmm_resume_p = vmm_resume;
346
347 return (VMM_INIT(vmm_ipinum));
348 }
349
350 static int
351 vmm_handler(module_t mod, int what, void *arg)
352 {
353 int error;
354
355 switch (what) {
356 case MOD_LOAD:
357 vmmdev_init();
358 error = vmm_init();
359 if (error == 0)
360 vmm_initialized = 1;
361 break;
362 case MOD_UNLOAD:
363 error = vmmdev_cleanup();
364 if (error == 0) {
365 vmm_resume_p = NULL;
366 iommu_cleanup();
367 if (vmm_ipinum != IPI_AST)
368 lapic_ipi_free(vmm_ipinum);
369 error = VMM_CLEANUP();
370 /*
371 * Something bad happened - prevent new
372 * VMs from being created
373 */
374 if (error)
375 vmm_initialized = 0;
376 }
377 break;
378 default:
379 error = 0;
380 break;
381 }
382 return (error);
383 }
384
385 static moduledata_t vmm_kmod = {
386 "vmm",
387 vmm_handler,
388 NULL
389 };
390
391 /*
392 * vmm initialization has the following dependencies:
393 *
394 * - VT-x initialization requires smp_rendezvous() and therefore must happen
395 * after SMP is fully functional (after SI_SUB_SMP).
396 */
397 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
398 MODULE_VERSION(vmm, 1);
399
400 static void
401 vm_init(struct vm *vm, bool create)
402 {
403 int i;
404
405 vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
406 vm->iommu = NULL;
407 vm->vioapic = vioapic_init(vm);
408 vm->vhpet = vhpet_init(vm);
409 vm->vatpic = vatpic_init(vm);
410 vm->vatpit = vatpit_init(vm);
411 vm->vpmtmr = vpmtmr_init(vm);
412 if (create)
413 vm->vrtc = vrtc_init(vm);
414
415 CPU_ZERO(&vm->active_cpus);
416
417 vm->suspend = 0;
418 CPU_ZERO(&vm->suspended_cpus);
419
420 for (i = 0; i < VM_MAXCPU; i++)
421 vcpu_init(vm, i, create);
422 }
423
424 int
425 vm_create(const char *name, struct vm **retvm)
426 {
427 struct vm *vm;
428 struct vmspace *vmspace;
429
430 /*
431 * If vmm.ko could not be successfully initialized then don't attempt
432 * to create the virtual machine.
433 */
434 if (!vmm_initialized)
435 return (ENXIO);
436
437 if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
438 return (EINVAL);
439
440 vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
441 if (vmspace == NULL)
442 return (ENOMEM);
443
444 vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
445 strcpy(vm->name, name);
446 vm->vmspace = vmspace;
447 mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
448
449 vm_init(vm, true);
450
451 *retvm = vm;
452 return (0);
453 }
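
/*
 * Illustrative lifecycle (not part of the original source): the vmmdev
 * ioctl layer drives a VM roughly as follows, error handling omitted:
 *
 *	struct vm *vm;
 *	struct vm_run vmrun = { .cpuid = 0 };
 *
 *	vm_create("guest0", &vm);
 *	vm_activate_cpu(vm, 0);		// mark vcpu 0 runnable
 *	vm_run(vm, &vmrun);		// enter guest, loop on exits
 *	vm_destroy(vm);
 */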
454
455 static void
456 vm_cleanup(struct vm *vm, bool destroy)
457 {
458 struct mem_map *mm;
459 int i;
460
461 ppt_unassign_all(vm);
462
463 if (vm->iommu != NULL)
464 iommu_destroy_domain(vm->iommu);
465
466 if (destroy)
467 vrtc_cleanup(vm->vrtc);
468 else
469 vrtc_reset(vm->vrtc);
470 vpmtmr_cleanup(vm->vpmtmr);
471 vatpit_cleanup(vm->vatpit);
472 vhpet_cleanup(vm->vhpet);
473 vatpic_cleanup(vm->vatpic);
474 vioapic_cleanup(vm->vioapic);
475
476 for (i = 0; i < VM_MAXCPU; i++)
477 vcpu_cleanup(vm, i, destroy);
478
479 VMCLEANUP(vm->cookie);
480
481 /*
482 * System memory is removed from the guest address space only when
483 * the VM is destroyed. This is because the mapping remains the same
484 * across VM reset.
485 *
486 * Device memory can be relocated by the guest (e.g. using PCI BARs)
487 * so those mappings are removed on a VM reset.
488 */
489 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
490 mm = &vm->mem_maps[i];
491 if (destroy || !sysmem_mapping(vm, mm))
492 vm_free_memmap(vm, i);
493 }
494
495 if (destroy) {
496 for (i = 0; i < VM_MAX_MEMSEGS; i++)
497 vm_free_memseg(vm, i);
498
499 VMSPACE_FREE(vm->vmspace);
500 vm->vmspace = NULL;
501 }
502 }
503
504 void
505 vm_destroy(struct vm *vm)
506 {
507 vm_cleanup(vm, true);
508 free(vm, M_VM);
509 }
510
511 int
512 vm_reinit(struct vm *vm)
513 {
514 int error;
515
516 /*
517 * A virtual machine can be reset only if all vcpus are suspended.
518 */
519 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
520 vm_cleanup(vm, false);
521 vm_init(vm, false);
522 error = 0;
523 } else {
524 error = EBUSY;
525 }
526
527 return (error);
528 }
529
530 const char *
531 vm_name(struct vm *vm)
532 {
533 return (vm->name);
534 }
535
536 int
537 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
538 {
539 vm_object_t obj;
540
541 if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
542 return (ENOMEM);
543 else
544 return (0);
545 }
546
547 int
548 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
549 {
550
551 vmm_mmio_free(vm->vmspace, gpa, len);
552 return (0);
553 }
554
555 /*
556 * Return 'true' if 'gpa' is allocated in the guest address space.
557 *
558 * This function is called in the context of a running vcpu which acts as
559 * an implicit lock on 'vm->mem_maps[]'.
560 */
561 bool
562 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
563 {
564 struct mem_map *mm;
565 int i;
566
567 #ifdef INVARIANTS
568 int hostcpu, state;
569 state = vcpu_get_state(vm, vcpuid, &hostcpu);
570 KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
571 ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
572 #endif
573
574 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
575 mm = &vm->mem_maps[i];
576 if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
577 return (true); /* 'gpa' is sysmem or devmem */
578 }
579
580 if (ppt_is_mmio(vm, gpa))
581 return (true); /* 'gpa' is pci passthru mmio */
582
583 return (false);
584 }
585
586 int
587 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
588 {
589 struct mem_seg *seg;
590 vm_object_t obj;
591
592 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
593 return (EINVAL);
594
595 if (len == 0 || (len & PAGE_MASK))
596 return (EINVAL);
597
598 seg = &vm->mem_segs[ident];
599 if (seg->object != NULL) {
600 if (seg->len == len && seg->sysmem == sysmem)
601 return (EEXIST);
602 else
603 return (EINVAL);
604 }
605
606 obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
607 if (obj == NULL)
608 return (ENOMEM);
609
610 seg->len = len;
611 seg->object = obj;
612 seg->sysmem = sysmem;
613 return (0);
614 }
615
616 int
617 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
618 vm_object_t *objptr)
619 {
620 struct mem_seg *seg;
621
622 if (ident < 0 || ident >= VM_MAX_MEMSEGS)
623 return (EINVAL);
624
625 seg = &vm->mem_segs[ident];
626 if (len)
627 *len = seg->len;
628 if (sysmem)
629 *sysmem = seg->sysmem;
630 if (objptr)
631 *objptr = seg->object;
632 return (0);
633 }
634
635 void
636 vm_free_memseg(struct vm *vm, int ident)
637 {
638 struct mem_seg *seg;
639
640 KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
641 ("%s: invalid memseg ident %d", __func__, ident));
642
643 seg = &vm->mem_segs[ident];
644 if (seg->object != NULL) {
645 vm_object_deallocate(seg->object);
646 bzero(seg, sizeof(struct mem_seg));
647 }
648 }
649
650 int
651 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
652 size_t len, int prot, int flags)
653 {
654 struct mem_seg *seg;
655 struct mem_map *m, *map;
656 vm_ooffset_t last;
657 int i, error;
658
659 if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
660 return (EINVAL);
661
662 if (flags & ~VM_MEMMAP_F_WIRED)
663 return (EINVAL);
664
665 if (segid < 0 || segid >= VM_MAX_MEMSEGS)
666 return (EINVAL);
667
668 seg = &vm->mem_segs[segid];
669 if (seg->object == NULL)
670 return (EINVAL);
671
672 last = first + len;
673 if (first < 0 || first >= last || last > seg->len)
674 return (EINVAL);
675
676 if ((gpa | first | last) & PAGE_MASK)
677 return (EINVAL);
678
679 map = NULL;
680 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
681 m = &vm->mem_maps[i];
682 if (m->len == 0) {
683 map = m;
684 break;
685 }
686 }
687
688 if (map == NULL)
689 return (ENOSPC);
690
691 error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
692 len, 0, VMFS_NO_SPACE, prot, prot, 0);
693 if (error != KERN_SUCCESS)
694 return (EFAULT);
695
696 vm_object_reference(seg->object);
697
698 if (flags & VM_MEMMAP_F_WIRED) {
699 error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
700 VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
701 if (error != KERN_SUCCESS) {
702 vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
703 return (EFAULT);
704 }
705 }
706
707 map->gpa = gpa;
708 map->len = len;
709 map->segoff = first;
710 map->segid = segid;
711 map->prot = prot;
712 map->flags = flags;
713 return (0);
714 }
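
/*
 * Illustrative use (not part of the original source): create 1GB of
 * wired system memory at guest physical address 0:
 *
 *	error = vm_alloc_memseg(vm, 0, 1UL << 30, true);
 *	error = vm_mmap_memseg(vm, 0, 0, 0, 1UL << 30, VM_PROT_ALL,
 *	    VM_MEMMAP_F_WIRED);
 *
 * where the vm_mmap_memseg() arguments are gpa 0, segid 0, segment
 * offset 0 and length 1GB.
 */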
715
716 int
717 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
718 vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
719 {
720 struct mem_map *mm, *mmnext;
721 int i;
722
723 mmnext = NULL;
724 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
725 mm = &vm->mem_maps[i];
726 if (mm->len == 0 || mm->gpa < *gpa)
727 continue;
728 if (mmnext == NULL || mm->gpa < mmnext->gpa)
729 mmnext = mm;
730 }
731
732 if (mmnext != NULL) {
733 *gpa = mmnext->gpa;
734 if (segid)
735 *segid = mmnext->segid;
736 if (segoff)
737 *segoff = mmnext->segoff;
738 if (len)
739 *len = mmnext->len;
740 if (prot)
741 *prot = mmnext->prot;
742 if (flags)
743 *flags = mmnext->flags;
744 return (0);
745 } else {
746 return (ENOENT);
747 }
748 }
749
750 static void
751 vm_free_memmap(struct vm *vm, int ident)
752 {
753 struct mem_map *mm;
754 int error;
755
756 mm = &vm->mem_maps[ident];
757 if (mm->len) {
758 error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
759 mm->gpa + mm->len);
760 KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
761 __func__, error));
762 bzero(mm, sizeof(struct mem_map));
763 }
764 }
765
766 static __inline bool
767 sysmem_mapping(struct vm *vm, struct mem_map *mm)
768 {
769
770 if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
771 return (true);
772 else
773 return (false);
774 }
775
776 static vm_paddr_t
777 sysmem_maxaddr(struct vm *vm)
778 {
779 struct mem_map *mm;
780 vm_paddr_t maxaddr;
781 int i;
782
783 maxaddr = 0;
784 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
785 mm = &vm->mem_maps[i];
786 if (sysmem_mapping(vm, mm)) {
787 if (maxaddr < mm->gpa + mm->len)
788 maxaddr = mm->gpa + mm->len;
789 }
790 }
791 return (maxaddr);
792 }
793
794 static void
795 vm_iommu_modify(struct vm *vm, boolean_t map)
796 {
797 int i, sz;
798 vm_paddr_t gpa, hpa;
799 struct mem_map *mm;
800 void *vp, *cookie, *host_domain;
801
802 sz = PAGE_SIZE;
803 host_domain = iommu_host_domain();
804
805 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
806 mm = &vm->mem_maps[i];
807 if (!sysmem_mapping(vm, mm))
808 continue;
809
810 if (map) {
811 KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
812 ("iommu map found invalid memmap %#lx/%#lx/%#x",
813 mm->gpa, mm->len, mm->flags));
814 if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
815 continue;
816 mm->flags |= VM_MEMMAP_F_IOMMU;
817 } else {
818 if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
819 continue;
820 mm->flags &= ~VM_MEMMAP_F_IOMMU;
821 KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
822 ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
823 mm->gpa, mm->len, mm->flags));
824 }
825
826 gpa = mm->gpa;
827 while (gpa < mm->gpa + mm->len) {
828 vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
829 &cookie);
830 KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
831 vm_name(vm), gpa));
832
833 vm_gpa_release(cookie);
834
835 hpa = DMAP_TO_PHYS((uintptr_t)vp);
836 if (map) {
837 iommu_create_mapping(vm->iommu, gpa, hpa, sz);
838 iommu_remove_mapping(host_domain, hpa, sz);
839 } else {
840 iommu_remove_mapping(vm->iommu, gpa, sz);
841 iommu_create_mapping(host_domain, hpa, hpa, sz);
842 }
843
844 gpa += PAGE_SIZE;
845 }
846 }
847
848 /*
849 * Invalidate the cached translations associated with the domain
850 * from which pages were removed.
851 */
852 if (map)
853 iommu_invalidate_tlb(host_domain);
854 else
855 iommu_invalidate_tlb(vm->iommu);
856 }
857
858 #define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE)
859 #define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE)
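
/*
 * Note: vm_iommu_modify() walks every sysmem mapping and, for 'map',
 * moves each wired page out of the host IOMMU domain and into the
 * VM's domain (and back again for 'unmap'). Only VM_MEMMAP_F_WIRED
 * mappings participate because the gpa->hpa translation programmed
 * into the IOMMU would go stale if the backing pages could be paged
 * out or relocated.
 */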
860
861 int
862 vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
863 {
864 int error;
865
866 error = ppt_unassign_device(vm, bus, slot, func);
867 if (error)
868 return (error);
869
870 if (ppt_assigned_devices(vm) == 0)
871 vm_iommu_unmap(vm);
872
873 return (0);
874 }
875
876 int
877 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
878 {
879 int error;
880 vm_paddr_t maxaddr;
881
882 /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
883 if (ppt_assigned_devices(vm) == 0) {
884 KASSERT(vm->iommu == NULL,
885 ("vm_assign_pptdev: iommu must be NULL"));
886 maxaddr = sysmem_maxaddr(vm);
887 vm->iommu = iommu_create_domain(maxaddr);
888 if (vm->iommu == NULL)
889 return (ENXIO);
890 vm_iommu_map(vm);
891 }
892
893 error = ppt_assign_device(vm, bus, slot, func);
894 return (error);
895 }
896
897 void *
898 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
899 void **cookie)
900 {
901 int i, count, pageoff;
902 struct mem_map *mm;
903 vm_page_t m;
904 #ifdef INVARIANTS
905 /*
906 * All vcpus are frozen by ioctls that modify the memory map
907 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->mem_maps[]' stability is
908 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
909 */
910 int state;
911 KASSERT(vcpuid >= -1 && vcpuid < VM_MAXCPU, ("%s: invalid vcpuid %d",
912 __func__, vcpuid));
913 for (i = 0; i < VM_MAXCPU; i++) {
914 if (vcpuid != -1 && vcpuid != i)
915 continue;
916 state = vcpu_get_state(vm, i, NULL);
917 KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
918 __func__, state));
919 }
920 #endif
921 pageoff = gpa & PAGE_MASK;
922 if (len > PAGE_SIZE - pageoff)
923 panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
924
925 count = 0;
926 for (i = 0; i < VM_MAX_MEMMAPS; i++) {
927 mm = &vm->mem_maps[i];
928 if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
929 gpa < mm->gpa + mm->len) {
930 count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
931 trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
932 break;
933 }
934 }
935
936 if (count == 1) {
937 *cookie = m;
938 return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
939 } else {
940 *cookie = NULL;
941 return (NULL);
942 }
943 }
944
945 void
946 vm_gpa_release(void *cookie)
947 {
948 vm_page_t m = cookie;
949
950 vm_page_lock(m);
951 vm_page_unhold(m);
952 vm_page_unlock(m);
953 }
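
/*
 * Illustrative use (not part of the original source): hold a guest
 * page, access it through the direct map, then drop the hold:
 *
 *	uint32_t val;
 *	void *cookie, *p;
 *
 *	p = vm_gpa_hold(vm, vcpuid, gpa, sizeof(val), VM_PROT_READ,
 *	    &cookie);
 *	if (p != NULL) {
 *		val = *(uint32_t *)p;
 *		vm_gpa_release(cookie);
 *	}
 *
 * Note that 'len' may not cross a page boundary (see the panic above).
 */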
954
955 int
956 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
957 {
958
959 if (vcpu < 0 || vcpu >= VM_MAXCPU)
960 return (EINVAL);
961
962 if (reg >= VM_REG_LAST)
963 return (EINVAL);
964
965 return (VMGETREG(vm->cookie, vcpu, reg, retval));
966 }
967
968 int
969 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
970 {
971 struct vcpu *vcpu;
972 int error;
973
974 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
975 return (EINVAL);
976
977 if (reg >= VM_REG_LAST)
978 return (EINVAL);
979
980 error = VMSETREG(vm->cookie, vcpuid, reg, val);
981 if (error || reg != VM_REG_GUEST_RIP)
982 return (error);
983
984 /* Set 'nextrip' to match the value of %rip */
985 VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
986 vcpu = &vm->vcpu[vcpuid];
987 vcpu->nextrip = val;
988 return (0);
989 }
990
991 static boolean_t
992 is_descriptor_table(int reg)
993 {
994
995 switch (reg) {
996 case VM_REG_GUEST_IDTR:
997 case VM_REG_GUEST_GDTR:
998 return (TRUE);
999 default:
1000 return (FALSE);
1001 }
1002 }
1003
1004 static boolean_t
1005 is_segment_register(int reg)
1006 {
1007
1008 switch (reg) {
1009 case VM_REG_GUEST_ES:
1010 case VM_REG_GUEST_CS:
1011 case VM_REG_GUEST_SS:
1012 case VM_REG_GUEST_DS:
1013 case VM_REG_GUEST_FS:
1014 case VM_REG_GUEST_GS:
1015 case VM_REG_GUEST_TR:
1016 case VM_REG_GUEST_LDTR:
1017 return (TRUE);
1018 default:
1019 return (FALSE);
1020 }
1021 }
1022
1023 int
1024 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
1025 struct seg_desc *desc)
1026 {
1027
1028 if (vcpu < 0 || vcpu >= VM_MAXCPU)
1029 return (EINVAL);
1030
1031 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1032 return (EINVAL);
1033
1034 return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1035 }
1036
1037 int
1038 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
1039 struct seg_desc *desc)
1040 {
1041 if (vcpu < 0 || vcpu >= VM_MAXCPU)
1042 return (EINVAL);
1043
1044 if (!is_segment_register(reg) && !is_descriptor_table(reg))
1045 return (EINVAL);
1046
1047 return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1048 }
1049
1050 static void
1051 restore_guest_fpustate(struct vcpu *vcpu)
1052 {
1053
1054 /* flush host state to the pcb */
1055 fpuexit(curthread);
1056
1057 /* restore guest FPU state */
1058 fpu_stop_emulating();
1059 fpurestore(vcpu->guestfpu);
1060
1061 /* restore guest XCR0 if XSAVE is enabled in the host */
1062 if (rcr4() & CR4_XSAVE)
1063 load_xcr(0, vcpu->guest_xcr0);
1064
1065 /*
1066 * The FPU is now "dirty" with the guest's state so turn on emulation
1067 * to trap any access to the FPU by the host.
1068 */
1069 fpu_start_emulating();
1070 }
1071
1072 static void
1073 save_guest_fpustate(struct vcpu *vcpu)
1074 {
1075
1076 if ((rcr0() & CR0_TS) == 0)
1077 panic("fpu emulation not enabled in host!");
1078
1079 /* save guest XCR0 and restore host XCR0 */
1080 if (rcr4() & CR4_XSAVE) {
1081 vcpu->guest_xcr0 = rxcr(0);
1082 load_xcr(0, vmm_get_host_xcr0());
1083 }
1084
1085 /* save guest FPU state */
1086 fpu_stop_emulating();
1087 fpusave(vcpu->guestfpu);
1088 fpu_start_emulating();
1089 }
1090
1091 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
1092
1093 static int
1094 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1095 bool from_idle)
1096 {
1097 struct vcpu *vcpu;
1098 int error;
1099
1100 vcpu = &vm->vcpu[vcpuid];
1101 vcpu_assert_locked(vcpu);
1102
1103 /*
1104 * State transitions from the vmmdev_ioctl() must always begin from
1105 * the VCPU_IDLE state. This guarantees that there is only a single
1106 * ioctl() operating on a vcpu at any point.
1107 */
1108 if (from_idle) {
1109 while (vcpu->state != VCPU_IDLE) {
1110 vcpu->reqidle = 1;
1111 vcpu_notify_event_locked(vcpu, false);
1112 VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1113 "idle requested", vcpu_state2str(vcpu->state));
1114 msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1115 }
1116 } else {
1117 KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1118 "vcpu idle state"));
1119 }
1120
1121 if (vcpu->state == VCPU_RUNNING) {
1122 KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1123 "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1124 } else {
1125 KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1126 "vcpu that is not running", vcpu->hostcpu));
1127 }
1128
1129 /*
1130 * The following state transitions are allowed:
1131 * IDLE -> FROZEN -> IDLE
1132 * FROZEN -> RUNNING -> FROZEN
1133 * FROZEN -> SLEEPING -> FROZEN
1134 */
1135 switch (vcpu->state) {
1136 case VCPU_IDLE:
1137 case VCPU_RUNNING:
1138 case VCPU_SLEEPING:
1139 error = (newstate != VCPU_FROZEN);
1140 break;
1141 case VCPU_FROZEN:
1142 error = (newstate == VCPU_FROZEN);
1143 break;
1144 default:
1145 error = 1;
1146 break;
1147 }
1148
1149 if (error)
1150 return (EBUSY);
1151
1152 VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1153 vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1154
1155 vcpu->state = newstate;
1156 if (newstate == VCPU_RUNNING)
1157 vcpu->hostcpu = curcpu;
1158 else
1159 vcpu->hostcpu = NOCPU;
1160
1161 if (newstate == VCPU_IDLE)
1162 wakeup(&vcpu->state);
1163
1164 return (0);
1165 }
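
/*
 * Note: a typical ioctl path therefore looks like IDLE -> FROZEN
 * (vcpu_set_state() from vmmdev), work on the vcpu, then FROZEN ->
 * IDLE; while vm_run() itself bounces the vcpu between FROZEN ->
 * RUNNING -> FROZEN (or FROZEN -> SLEEPING -> FROZEN for an emulated
 * HLT).
 */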
1166
1167 static void
1168 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1169 {
1170 int error;
1171
1172 if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1173 panic("Error %d setting state to %d\n", error, newstate);
1174 }
1175
1176 static void
1177 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1178 {
1179 int error;
1180
1181 if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1182 panic("Error %d setting state to %d", error, newstate);
1183 }
1184
1185 static void
1186 vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
1187 {
1188
1189 KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
1190
1191 /*
1192 * Update 'rendezvous_func' and execute a write memory barrier to
1193 * ensure that it is visible across all host cpus. This is not needed
1194 * for correctness but it does ensure that all the vcpus will notice
1195 * that the rendezvous is requested immediately.
1196 */
1197 vm->rendezvous_func = func;
1198 wmb();
1199 }
1200
1201 #define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \
1202 do { \
1203 if (vcpuid >= 0) \
1204 VCPU_CTR0(vm, vcpuid, fmt); \
1205 else \
1206 VM_CTR0(vm, fmt); \
1207 } while (0)
1208
1209 static void
1210 vm_handle_rendezvous(struct vm *vm, int vcpuid)
1211 {
1212
1213 KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
1214 ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
1215
1216 mtx_lock(&vm->rendezvous_mtx);
1217 while (vm->rendezvous_func != NULL) {
1218 /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
1219 CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
1220
1221 if (vcpuid != -1 &&
1222 CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
1223 !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
1224 VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
1225 (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
1226 CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1227 }
1228 if (CPU_CMP(&vm->rendezvous_req_cpus,
1229 &vm->rendezvous_done_cpus) == 0) {
1230 VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
1231 vm_set_rendezvous_func(vm, NULL);
1232 wakeup(&vm->rendezvous_func);
1233 break;
1234 }
1235 RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
1236 mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1237 "vmrndv", 0);
1238 }
1239 mtx_unlock(&vm->rendezvous_mtx);
1240 }
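
/*
 * Note: a rendezvous runs 'rendezvous_func' on every vcpu named in
 * 'rendezvous_req_cpus'; each vcpu executes the callback on its own
 * thread and then sleeps until 'rendezvous_done_cpus' covers the
 * requested set. The helper above is also called from the suspend
 * path so that a suspend racing with a rendezvous cannot deadlock.
 */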
1241
1242 /*
1243 * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1244 */
1245 static int
1246 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1247 {
1248 struct vcpu *vcpu;
1249 const char *wmesg;
1250 int t, vcpu_halted, vm_halted;
1251
1252 KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1253
1254 vcpu = &vm->vcpu[vcpuid];
1255 vcpu_halted = 0;
1256 vm_halted = 0;
1257
1258 vcpu_lock(vcpu);
1259 while (1) {
1260 /*
1261 * Do a final check for pending NMI or interrupts before
1262 * really putting this thread to sleep. Also check for
1263 * software events that would cause this vcpu to wakeup.
1264 *
1265 * These interrupts/events could have happened after the
1266 * vcpu returned from VMRUN() and before it acquired the
1267 * vcpu lock above.
1268 */
1269 if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
1270 break;
1271 if (vm_nmi_pending(vm, vcpuid))
1272 break;
1273 if (!intr_disabled) {
1274 if (vm_extint_pending(vm, vcpuid) ||
1275 vlapic_pending_intr(vcpu->vlapic, NULL)) {
1276 break;
1277 }
1278 }
1279
1280 /* Don't go to sleep if the vcpu thread needs to yield */
1281 if (vcpu_should_yield(vm, vcpuid))
1282 break;
1283
1284 /*
1285 * Some Linux guests implement "halt" by having all vcpus
1286 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1287 * track of the vcpus that have entered this state. When all
1288 * vcpus enter the halted state the virtual machine is halted.
1289 */
1290 if (intr_disabled) {
1291 wmesg = "vmhalt";
1292 VCPU_CTR0(vm, vcpuid, "Halted");
1293 if (!vcpu_halted && halt_detection_enabled) {
1294 vcpu_halted = 1;
1295 CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1296 }
1297 if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1298 vm_halted = 1;
1299 break;
1300 }
1301 } else {
1302 wmesg = "vmidle";
1303 }
1304
1305 t = ticks;
1306 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1307 /*
1308 * XXX msleep_spin() cannot be interrupted by signals so
1309 * wake up periodically to check pending signals.
1310 */
1311 msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1312 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1313 vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1314 }
1315
1316 if (vcpu_halted)
1317 CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1318
1319 vcpu_unlock(vcpu);
1320
1321 if (vm_halted)
1322 vm_suspend(vm, VM_SUSPEND_HALT);
1323
1324 return (0);
1325 }
1326
1327 static int
1328 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1329 {
1330 int rv, ftype;
1331 struct vm_map *map;
1332 struct vcpu *vcpu;
1333 struct vm_exit *vme;
1334
1335 vcpu = &vm->vcpu[vcpuid];
1336 vme = &vcpu->exitinfo;
1337
1338 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1339 __func__, vme->inst_length));
1340
1341 ftype = vme->u.paging.fault_type;
1342 KASSERT(ftype == VM_PROT_READ ||
1343 ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1344 ("vm_handle_paging: invalid fault_type %d", ftype));
1345
1346 if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1347 rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1348 vme->u.paging.gpa, ftype);
1349 if (rv == 0) {
1350 VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
1351 ftype == VM_PROT_READ ? "accessed" : "dirty",
1352 vme->u.paging.gpa);
1353 goto done;
1354 }
1355 }
1356
1357 map = &vm->vmspace->vm_map;
1358 rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1359
1360 VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1361 "ftype = %d", rv, vme->u.paging.gpa, ftype);
1362
1363 if (rv != KERN_SUCCESS)
1364 return (EFAULT);
1365 done:
1366 return (0);
1367 }
1368
1369 static int
1370 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1371 {
1372 struct vie *vie;
1373 struct vcpu *vcpu;
1374 struct vm_exit *vme;
1375 uint64_t gla, gpa, cs_base;
1376 struct vm_guest_paging *paging;
1377 mem_region_read_t mread;
1378 mem_region_write_t mwrite;
1379 enum vm_cpu_mode cpu_mode;
1380 int cs_d, error, fault;
1381
1382 vcpu = &vm->vcpu[vcpuid];
1383 vme = &vcpu->exitinfo;
1384
1385 KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1386 __func__, vme->inst_length));
1387
1388 gla = vme->u.inst_emul.gla;
1389 gpa = vme->u.inst_emul.gpa;
1390 cs_base = vme->u.inst_emul.cs_base;
1391 cs_d = vme->u.inst_emul.cs_d;
1392 vie = &vme->u.inst_emul.vie;
1393 paging = &vme->u.inst_emul.paging;
1394 cpu_mode = paging->cpu_mode;
1395
1396 VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
1397
1398 /* Fetch, decode and emulate the faulting instruction */
1399 if (vie->num_valid == 0) {
1400 error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
1401 cs_base, VIE_INST_SIZE, vie, &fault);
1402 } else {
1403 /*
1404 * The instruction bytes have already been copied into 'vie'
1405 */
1406 error = fault = 0;
1407 }
1408 if (error || fault)
1409 return (error);
1410
1411 if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
1412 VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
1413 vme->rip + cs_base);
1414 *retu = true; /* dump instruction bytes in userspace */
1415 return (0);
1416 }
1417
1418 /*
1419 * Update 'nextrip' based on the length of the emulated instruction.
1420 */
1421 vme->inst_length = vie->num_processed;
1422 vcpu->nextrip += vie->num_processed;
1423 VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
1424 "decoding", vcpu->nextrip);
1425
1426 /* return to userland unless this is an in-kernel emulated device */
1427 if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1428 mread = lapic_mmio_read;
1429 mwrite = lapic_mmio_write;
1430 } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1431 mread = vioapic_mmio_read;
1432 mwrite = vioapic_mmio_write;
1433 } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1434 mread = vhpet_mmio_read;
1435 mwrite = vhpet_mmio_write;
1436 } else {
1437 *retu = true;
1438 return (0);
1439 }
1440
1441 error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
1442 mread, mwrite, retu);
1443
1444 return (error);
1445 }
1446
1447 static int
1448 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
1449 {
1450 int i, done;
1451 struct vcpu *vcpu;
1452
1453 done = 0;
1454 vcpu = &vm->vcpu[vcpuid];
1455
1456 CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1457
1458 /*
1459 * Wait until all 'active_cpus' have suspended themselves.
1460 *
1461 * Since a VM may be suspended at any time including when one or
1462 * more vcpus are doing a rendezvous we need to call the rendezvous
1463 * handler while we are waiting to prevent a deadlock.
1464 */
1465 vcpu_lock(vcpu);
1466 while (1) {
1467 if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1468 VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1469 break;
1470 }
1471
1472 if (vm->rendezvous_func == NULL) {
1473 VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1474 vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1475 msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1476 vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1477 } else {
1478 VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1479 vcpu_unlock(vcpu);
1480 vm_handle_rendezvous(vm, vcpuid);
1481 vcpu_lock(vcpu);
1482 }
1483 }
1484 vcpu_unlock(vcpu);
1485
1486 /*
1487 * Wakeup the other sleeping vcpus and return to userspace.
1488 */
1489 for (i = 0; i < VM_MAXCPU; i++) {
1490 if (CPU_ISSET(i, &vm->suspended_cpus)) {
1491 vcpu_notify_event(vm, i, false);
1492 }
1493 }
1494
1495 *retu = true;
1496 return (0);
1497 }
1498
1499 static int
1500 vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
1501 {
1502 struct vcpu *vcpu = &vm->vcpu[vcpuid];
1503
1504 vcpu_lock(vcpu);
1505 KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1506 vcpu->reqidle = 0;
1507 vcpu_unlock(vcpu);
1508 *retu = true;
1509 return (0);
1510 }
1511
1512 int
1513 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1514 {
1515 int i;
1516
1517 if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1518 return (EINVAL);
1519
1520 if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1521 VM_CTR2(vm, "virtual machine already suspended %d/%d",
1522 vm->suspend, how);
1523 return (EALREADY);
1524 }
1525
1526 VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1527
1528 /*
1529 * Notify all active vcpus that they are now suspended.
1530 */
1531 for (i = 0; i < VM_MAXCPU; i++) {
1532 if (CPU_ISSET(i, &vm->active_cpus))
1533 vcpu_notify_event(vm, i, false);
1534 }
1535
1536 return (0);
1537 }
1538
1539 void
1540 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1541 {
1542 struct vm_exit *vmexit;
1543
1544 KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1545 ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1546
1547 vmexit = vm_exitinfo(vm, vcpuid);
1548 vmexit->rip = rip;
1549 vmexit->inst_length = 0;
1550 vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1551 vmexit->u.suspended.how = vm->suspend;
1552 }
1553
1554 void
1555 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
1556 {
1557 struct vm_exit *vmexit;
1558
1559 KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
1560
1561 vmexit = vm_exitinfo(vm, vcpuid);
1562 vmexit->rip = rip;
1563 vmexit->inst_length = 0;
1564 vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1565 vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
1566 }
1567
1568 void
1569 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
1570 {
1571 struct vm_exit *vmexit;
1572
1573 vmexit = vm_exitinfo(vm, vcpuid);
1574 vmexit->rip = rip;
1575 vmexit->inst_length = 0;
1576 vmexit->exitcode = VM_EXITCODE_REQIDLE;
1577 vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
1578 }
1579
1580 void
1581 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1582 {
1583 struct vm_exit *vmexit;
1584
1585 vmexit = vm_exitinfo(vm, vcpuid);
1586 vmexit->rip = rip;
1587 vmexit->inst_length = 0;
1588 vmexit->exitcode = VM_EXITCODE_BOGUS;
1589 vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1590 }
1591
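/*
 * Note: vm_run() is the heart of the vcpu thread: pin state with a
 * critical section, VMRUN() into the guest, then either handle the
 * exit in the kernel and loop ('goto restart') or set 'retu' and
 * bounce the exit up to the userspace hypervisor.
 */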
1592 int
1593 vm_run(struct vm *vm, struct vm_run *vmrun)
1594 {
1595 struct vm_eventinfo evinfo;
1596 int error, vcpuid;
1597 struct vcpu *vcpu;
1598 struct pcb *pcb;
1599 uint64_t tscval;
1600 struct vm_exit *vme;
1601 bool retu, intr_disabled;
1602 pmap_t pmap;
1603
1604 vcpuid = vmrun->cpuid;
1605
1606 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1607 return (EINVAL);
1608
1609 if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1610 return (EINVAL);
1611
1612 if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1613 return (EINVAL);
1614
1615 pmap = vmspace_pmap(vm->vmspace);
1616 vcpu = &vm->vcpu[vcpuid];
1617 vme = &vcpu->exitinfo;
1618 evinfo.rptr = &vm->rendezvous_func;
1619 evinfo.sptr = &vm->suspend;
1620 evinfo.iptr = &vcpu->reqidle;
1621 restart:
1622 critical_enter();
1623
1624 KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1625 ("vm_run: absurd pm_active"));
1626
1627 tscval = rdtsc();
1628
1629 pcb = PCPU_GET(curpcb);
1630 set_pcb_flags(pcb, PCB_FULL_IRET);
1631
1632 restore_guest_fpustate(vcpu);
1633
1634 vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1635 error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
1636 vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1637
1638 save_guest_fpustate(vcpu);
1639
1640 vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1641
1642 critical_exit();
1643
1644 if (error == 0) {
1645 retu = false;
1646 vcpu->nextrip = vme->rip + vme->inst_length;
1647 switch (vme->exitcode) {
1648 case VM_EXITCODE_REQIDLE:
1649 error = vm_handle_reqidle(vm, vcpuid, &retu);
1650 break;
1651 case VM_EXITCODE_SUSPENDED:
1652 error = vm_handle_suspend(vm, vcpuid, &retu);
1653 break;
1654 case VM_EXITCODE_IOAPIC_EOI:
1655 vioapic_process_eoi(vm, vcpuid,
1656 vme->u.ioapic_eoi.vector);
1657 break;
1658 case VM_EXITCODE_RENDEZVOUS:
1659 vm_handle_rendezvous(vm, vcpuid);
1660 error = 0;
1661 break;
1662 case VM_EXITCODE_HLT:
1663 intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1664 error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1665 break;
1666 case VM_EXITCODE_PAGING:
1667 error = vm_handle_paging(vm, vcpuid, &retu);
1668 break;
1669 case VM_EXITCODE_INST_EMUL:
1670 error = vm_handle_inst_emul(vm, vcpuid, &retu);
1671 break;
1672 case VM_EXITCODE_INOUT:
1673 case VM_EXITCODE_INOUT_STR:
1674 error = vm_handle_inout(vm, vcpuid, vme, &retu);
1675 break;
1676 case VM_EXITCODE_MONITOR:
1677 case VM_EXITCODE_MWAIT:
1678 vm_inject_ud(vm, vcpuid);
1679 break;
1680 default:
1681 retu = true; /* handled in userland */
1682 break;
1683 }
1684 }
1685
1686 if (error == 0 && retu == false)
1687 goto restart;
1688
1689 VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
1690
1691 /* copy the exit information */
1692 bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1693 return (error);
1694 }
1695
1696 int
1697 vm_restart_instruction(void *arg, int vcpuid)
1698 {
1699 struct vm *vm;
1700 struct vcpu *vcpu;
1701 enum vcpu_state state;
1702 uint64_t rip;
1703 int error;
1704
1705 vm = arg;
1706 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1707 return (EINVAL);
1708
1709 vcpu = &vm->vcpu[vcpuid];
1710 state = vcpu_get_state(vm, vcpuid, NULL);
1711 if (state == VCPU_RUNNING) {
1712 /*
1713 * When a vcpu is "running" the next instruction is determined
1714 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
1715 * Thus setting 'inst_length' to zero will cause the current
1716 * instruction to be restarted.
1717 */
1718 vcpu->exitinfo.inst_length = 0;
1719 VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
1720 "setting inst_length to zero", vcpu->exitinfo.rip);
1721 } else if (state == VCPU_FROZEN) {
1722 /*
1723 * When a vcpu is "frozen" it is outside the critical section
1724 * around VMRUN() and 'nextrip' points to the next instruction.
1725 * Thus instruction restart is achieved by setting 'nextrip'
1726 * to the vcpu's %rip.
1727 */
1728 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
1729 KASSERT(!error, ("%s: error %d getting rip", __func__, error));
1730 VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
1731 "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
1732 vcpu->nextrip = rip;
1733 } else {
1734 panic("%s: invalid state %d", __func__, state);
1735 }
1736 return (0);
1737 }
1738
1739 int
1740 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
1741 {
1742 struct vcpu *vcpu;
1743 int type, vector;
1744
1745 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1746 return (EINVAL);
1747
1748 vcpu = &vm->vcpu[vcpuid];
1749
1750 if (info & VM_INTINFO_VALID) {
1751 type = info & VM_INTINFO_TYPE;
1752 vector = info & 0xff;
1753 if (type == VM_INTINFO_NMI && vector != IDT_NMI)
1754 return (EINVAL);
1755 if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
1756 return (EINVAL);
1757 if (info & VM_INTINFO_RSVD)
1758 return (EINVAL);
1759 } else {
1760 info = 0;
1761 }
1762 VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
1763 vcpu->exitintinfo = info;
1764 return (0);
1765 }
1766
1767 enum exc_class {
1768 EXC_BENIGN,
1769 EXC_CONTRIBUTORY,
1770 EXC_PAGEFAULT
1771 };
1772
1773 #define IDT_VE 20 /* Virtualization Exception (Intel specific) */
1774
1775 static enum exc_class
1776 exception_class(uint64_t info)
1777 {
1778 int type, vector;
1779
1780 KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
1781 type = info & VM_INTINFO_TYPE;
1782 vector = info & 0xff;
1783
1784 /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
1785 switch (type) {
1786 case VM_INTINFO_HWINTR:
1787 case VM_INTINFO_SWINTR:
1788 case VM_INTINFO_NMI:
1789 return (EXC_BENIGN);
1790 default:
1791 /*
1792 * Hardware exception.
1793 *
1794 * SVM and VT-x use identical type values to represent NMI,
1795 * hardware interrupt and software interrupt.
1796 *
1797 * SVM uses type '3' for all exceptions. VT-x uses type '3'
1798 * for exceptions except #BP and #OF. #BP and #OF use a type
1799 * value of '5' or '6'. Therefore we don't check for explicit
1800 * values of 'type' to classify 'intinfo' into a hardware
1801 * exception.
1802 */
1803 break;
1804 }
1805
1806 switch (vector) {
1807 case IDT_PF:
1808 case IDT_VE:
1809 return (EXC_PAGEFAULT);
1810 case IDT_DE:
1811 case IDT_TS:
1812 case IDT_NP:
1813 case IDT_SS:
1814 case IDT_GP:
1815 return (EXC_CONTRIBUTORY);
1816 default:
1817 return (EXC_BENIGN);
1818 }
1819 }
1820
1821 static int
1822 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
1823 uint64_t *retinfo)
1824 {
1825 enum exc_class exc1, exc2;
1826 int type1, vector1;
1827
1828 KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
1829 KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
1830
1831 /*
1832 * If an exception occurs while attempting to call the double-fault
1833 * handler the processor enters shutdown mode (aka triple fault).
1834 */
1835 type1 = info1 & VM_INTINFO_TYPE;
1836 vector1 = info1 & 0xff;
1837 if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
1838 VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
1839 info1, info2);
1840 vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
1841 *retinfo = 0;
1842 return (0);
1843 }
1844
1845 /*
1846 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
1847 */
1848 exc1 = exception_class(info1);
1849 exc2 = exception_class(info2);
1850 if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
1851 (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
1852 /* Convert nested fault into a double fault. */
1853 *retinfo = IDT_DF;
1854 *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1855 *retinfo |= VM_INTINFO_DEL_ERRCODE;
1856 } else {
1857 /* Handle exceptions serially */
1858 *retinfo = info2;
1859 }
1860 return (1);
1861 }
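
/*
 * Worked example (not part of the original source): a #GP
 * (contributory) raised while delivering a #PF (pagefault class)
 * satisfies the second clause above, so the guest sees a #DF with
 * error code 0; a #PF raised while delivering a benign #DB is simply
 * delivered serially as 'info2'.
 */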
1862
1863 static uint64_t
1864 vcpu_exception_intinfo(struct vcpu *vcpu)
1865 {
1866 uint64_t info = 0;
1867
1868 if (vcpu->exception_pending) {
1869 info = vcpu->exc_vector & 0xff;
1870 info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1871 if (vcpu->exc_errcode_valid) {
1872 info |= VM_INTINFO_DEL_ERRCODE;
1873 info |= (uint64_t)vcpu->exc_errcode << 32;
1874 }
1875 }
1876 return (info);
1877 }
1878
1879 int
1880 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
1881 {
1882 struct vcpu *vcpu;
1883 uint64_t info1, info2;
1884 int valid;
1885
1886 KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
1887
1888 vcpu = &vm->vcpu[vcpuid];
1889
1890 info1 = vcpu->exitintinfo;
1891 vcpu->exitintinfo = 0;
1892
1893 info2 = 0;
1894 if (vcpu->exception_pending) {
1895 info2 = vcpu_exception_intinfo(vcpu);
1896 vcpu->exception_pending = 0;
1897 VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
1898 vcpu->exc_vector, info2);
1899 }
1900
1901 if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
1902 valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
1903 } else if (info1 & VM_INTINFO_VALID) {
1904 *retinfo = info1;
1905 valid = 1;
1906 } else if (info2 & VM_INTINFO_VALID) {
1907 *retinfo = info2;
1908 valid = 1;
1909 } else {
1910 valid = 0;
1911 }
1912
1913 if (valid) {
1914 VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
1915 "retinfo(%#lx)", __func__, info1, info2, *retinfo);
1916 }
1917
1918 return (valid);
1919 }
1920
1921 int
1922 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
1923 {
1924 struct vcpu *vcpu;
1925
1926 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1927 return (EINVAL);
1928
1929 vcpu = &vm->vcpu[vcpuid];
1930 *info1 = vcpu->exitintinfo;
1931 *info2 = vcpu_exception_intinfo(vcpu);
1932 return (0);
1933 }
1934
1935 int
1936 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
1937 uint32_t errcode, int restart_instruction)
1938 {
1939 struct vcpu *vcpu;
1940 uint64_t regval;
1941 int error;
1942
1943 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
1944 return (EINVAL);
1945
1946 if (vector < 0 || vector >= 32)
1947 return (EINVAL);
1948
1949 /*
1950 * A double fault exception should never be injected directly into
1951 * the guest. It is a derived exception that results from specific
1952 * combinations of nested faults.
1953 */
1954 if (vector == IDT_DF)
1955 return (EINVAL);
1956
1957 vcpu = &vm->vcpu[vcpuid];
1958
1959 if (vcpu->exception_pending) {
1960 VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
1961 "pending exception %d", vector, vcpu->exc_vector);
1962 return (EBUSY);
1963 }
1964
1965 if (errcode_valid) {
1966 /*
1967 * Exceptions don't deliver an error code in real mode.
1968 */
1969 error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
1970 KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
1971 if (!(regval & CR0_PE))
1972 errcode_valid = 0;
1973 }
1974
1975 /*
1976 * From section 26.6.1 "Interruptibility State" in Intel SDM:
1977 *
1978 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
1979 * one instruction or incurs an exception.
1980 */
1981 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
1982 KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
1983 __func__, error));
1984
1985 if (restart_instruction)
1986 vm_restart_instruction(vm, vcpuid);
1987
1988 vcpu->exception_pending = 1;
1989 vcpu->exc_vector = vector;
1990 vcpu->exc_errcode = errcode;
1991 vcpu->exc_errcode_valid = errcode_valid;
1992 VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
1993 return (0);
1994 }
1995
1996 void
1997 vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
1998 int errcode)
1999 {
2000 struct vm *vm;
2001 int error, restart_instruction;
2002
2003 vm = vmarg;
2004 restart_instruction = 1;
2005
2006 error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
2007 errcode, restart_instruction);
2008 KASSERT(error == 0, ("vm_inject_exception error %d", error));
2009 }
2010
2011 void
2012 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
2013 {
2014 struct vm *vm;
2015 int error;
2016
2017 vm = vmarg;
2018 VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
2019 error_code, cr2);
2020
2021 error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2022 KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2023
2024 vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2025 }
2026
2027 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2028
2029 int
2030 vm_inject_nmi(struct vm *vm, int vcpuid)
2031 {
2032 struct vcpu *vcpu;
2033
2034 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2035 return (EINVAL);
2036
2037 vcpu = &vm->vcpu[vcpuid];
2038
2039 vcpu->nmi_pending = 1;
2040 vcpu_notify_event(vm, vcpuid, false);
2041 return (0);
2042 }
2043
2044 int
2045 vm_nmi_pending(struct vm *vm, int vcpuid)
2046 {
2047 struct vcpu *vcpu;
2048
2049 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2050 panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2051
2052 vcpu = &vm->vcpu[vcpuid];
2053
2054 return (vcpu->nmi_pending);
2055 }
2056
2057 void
2058 vm_nmi_clear(struct vm *vm, int vcpuid)
2059 {
2060 struct vcpu *vcpu;
2061
2062 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2063 panic("vm_nmi_clear: invalid vcpuid %d", vcpuid);
2064
2065 vcpu = &vm->vcpu[vcpuid];
2066
2067 if (vcpu->nmi_pending == 0)
2068 panic("vm_nmi_clear: inconsistent nmi_pending state");
2069
2070 vcpu->nmi_pending = 0;
2071 vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2072 }
2073
2074 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2075
2076 int
2077 vm_inject_extint(struct vm *vm, int vcpuid)
2078 {
2079 struct vcpu *vcpu;
2080
2081 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2082 return (EINVAL);
2083
2084 vcpu = &vm->vcpu[vcpuid];
2085
2086 vcpu->extint_pending = 1;
2087 vcpu_notify_event(vm, vcpuid, false);
2088 return (0);
2089 }
2090
2091 int
2092 vm_extint_pending(struct vm *vm, int vcpuid)
2093 {
2094 struct vcpu *vcpu;
2095
2096 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2097 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2098
2099 vcpu = &vm->vcpu[vcpuid];
2100
2101 return (vcpu->extint_pending);
2102 }
2103
2104 void
2105 vm_extint_clear(struct vm *vm, int vcpuid)
2106 {
2107 struct vcpu *vcpu;
2108
2109 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2110 panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2111
2112 vcpu = &vm->vcpu[vcpuid];
2113
2114 if (vcpu->extint_pending == 0)
2115 panic("vm_extint_clear: inconsistent extint_pending state");
2116
2117 vcpu->extint_pending = 0;
2118 vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2119 }
2120
2121 int
2122 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2123 {
2124 if (vcpu < 0 || vcpu >= VM_MAXCPU)
2125 return (EINVAL);
2126
2127 if (type < 0 || type >= VM_CAP_MAX)
2128 return (EINVAL);
2129
2130 return (VMGETCAP(vm->cookie, vcpu, type, retval));
2131 }
2132
2133 int
2134 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2135 {
2136 if (vcpu < 0 || vcpu >= VM_MAXCPU)
2137 return (EINVAL);
2138
2139 if (type < 0 || type >= VM_CAP_MAX)
2140 return (EINVAL);
2141
2142 return (VMSETCAP(vm->cookie, vcpu, type, val));
2143 }
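
/*
 * Editor's illustrative sketch: probing and enabling a capability with
 * the accessors above.  VM_CAP_HALT_EXIT is assumed to be one of the
 * vm_cap_type values from <machine/vmm.h>; the CPU-specific backend
 * rejects capabilities it does not implement.
 */
static int
example_enable_halt_exit(struct vm *vm, int vcpu)
{
	int error, val;

	error = vm_get_capability(vm, vcpu, VM_CAP_HALT_EXIT, &val);
	if (error == 0 && val == 0)
		error = vm_set_capability(vm, vcpu, VM_CAP_HALT_EXIT, 1);
	return (error);
}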
2144
2145 struct vlapic *
2146 vm_lapic(struct vm *vm, int cpu)
2147 {
2148 return (vm->vcpu[cpu].vlapic);
2149 }
2150
2151 struct vioapic *
2152 vm_ioapic(struct vm *vm)
2153 {
2154
2155 return (vm->vioapic);
2156 }
2157
2158 struct vhpet *
2159 vm_hpet(struct vm *vm)
2160 {
2161
2162 return (vm->vhpet);
2163 }
2164
2165 boolean_t
2166 vmm_is_pptdev(int bus, int slot, int func)
2167 {
2168 int found, i, n;
2169 int b, s, f;
2170 char *val, *cp, *cp2;
2171
2172 /*
2173 * XXX
2174 * The length of an environment variable is limited to 128 bytes, which
2175 * puts an upper limit on the number of passthru devices that may be
2176 * specified using a single environment variable.
2177 *
2178 * Work around this by scanning multiple environment variable
2179 * names instead of a single one - yuck!
2180 */
2181 const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
2182
2183 /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
2184 found = 0;
2185 for (i = 0; names[i] != NULL && !found; i++) {
2186 cp = val = kern_getenv(names[i]);
2187 while (cp != NULL && *cp != '\0') {
2188 if ((cp2 = strchr(cp, ' ')) != NULL)
2189 *cp2 = '\0';
2190
2191 n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
2192 if (n == 3 && bus == b && slot == s && func == f) {
2193 found = 1;
2194 break;
2195 }
2196
2197 if (cp2 != NULL)
2198 *cp2++ = ' ';
2199
2200 cp = cp2;
2201 }
2202 freeenv(val);
2203 }
2204 return (found);
2205 }
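
/*
 * Editor's illustrative usage note: with the loader tunable set as,
 * for example, pptdevs="2/0/0 4/1/0", vmm_is_pptdev(2, 0, 0) above
 * returns TRUE while vmm_is_pptdev(3, 0, 0) returns FALSE, since only
 * the bus/slot/function triples listed in pptdevs (or the overflow
 * variables pptdevs2/pptdevs3) are treated as passthru devices.
 */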
2206
2207 void *
2208 vm_iommu_domain(struct vm *vm)
2209 {
2210
2211 return (vm->iommu);
2212 }
2213
2214 int
2215 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
2216 bool from_idle)
2217 {
2218 int error;
2219 struct vcpu *vcpu;
2220
2221 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2222 panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
2223
2224 vcpu = &vm->vcpu[vcpuid];
2225
2226 vcpu_lock(vcpu);
2227 error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
2228 vcpu_unlock(vcpu);
2229
2230 return (error);
2231 }
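
/*
 * Editor's illustrative sketch: the "freeze, mutate, unfreeze" pattern
 * built on vcpu_set_state() above.  A hypothetical ioctl-style caller
 * moves the vcpu to VCPU_FROZEN before touching its state; passing
 * from_idle=true makes the transition wait until the vcpu is actually
 * idle.
 */
static int
example_with_frozen_vcpu(struct vm *vm, int vcpuid)
{
	int error;

	error = vcpu_set_state(vm, vcpuid, VCPU_FROZEN, true);
	if (error)
		return (error);
	/* ... vcpu state may be read or modified safely here ... */
	vcpu_set_state(vm, vcpuid, VCPU_IDLE, false);
	return (0);
}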
2232
2233 enum vcpu_state
2234 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
2235 {
2236 struct vcpu *vcpu;
2237 enum vcpu_state state;
2238
2239 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2240 panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
2241
2242 vcpu = &vm->vcpu[vcpuid];
2243
2244 vcpu_lock(vcpu);
2245 state = vcpu->state;
2246 if (hostcpu != NULL)
2247 *hostcpu = vcpu->hostcpu;
2248 vcpu_unlock(vcpu);
2249
2250 return (state);
2251 }
2252
2253 int
2254 vm_activate_cpu(struct vm *vm, int vcpuid)
2255 {
2256
2257 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2258 return (EINVAL);
2259
2260 if (CPU_ISSET(vcpuid, &vm->active_cpus))
2261 return (EBUSY);
2262
2263 VCPU_CTR0(vm, vcpuid, "activated");
2264 CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
2265 return (0);
2266 }
2267
2268 cpuset_t
2269 vm_active_cpus(struct vm *vm)
2270 {
2271
2272 return (vm->active_cpus);
2273 }
2274
2275 cpuset_t
2276 vm_suspended_cpus(struct vm *vm)
2277 {
2278
2279 return (vm->suspended_cpus);
2280 }
2281
2282 void *
2283 vcpu_stats(struct vm *vm, int vcpuid)
2284 {
2285
2286 return (vm->vcpu[vcpuid].stats);
2287 }
2288
2289 int
2290 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
2291 {
2292 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2293 return (EINVAL);
2294
2295 *state = vm->vcpu[vcpuid].x2apic_state;
2296
2297 return (0);
2298 }
2299
2300 int
2301 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
2302 {
2303 if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
2304 return (EINVAL);
2305
2306 if (state >= X2APIC_STATE_LAST)
2307 return (EINVAL);
2308
2309 vm->vcpu[vcpuid].x2apic_state = state;
2310
2311 vlapic_set_x2apic_state(vm, vcpuid, state);
2312
2313 return (0);
2314 }
2315
2316 /*
2317 * This function is called to ensure that a vcpu "sees" a pending event
2318 * as soon as possible:
2319 * - If the vcpu thread is sleeping then it is woken up.
2320 * - If the vcpu is running on a different host_cpu then an IPI will be directed
2321 * to the host_cpu to cause the vcpu to trap into the hypervisor.
2322 */
2323 static void
2324 vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
2325 {
2326 int hostcpu;
2327
2328 hostcpu = vcpu->hostcpu;
2329 if (vcpu->state == VCPU_RUNNING) {
2330 KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
2331 if (hostcpu != curcpu) {
2332 if (lapic_intr) {
2333 vlapic_post_intr(vcpu->vlapic, hostcpu,
2334 vmm_ipinum);
2335 } else {
2336 ipi_cpu(hostcpu, vmm_ipinum);
2337 }
2338 } else {
2339 /*
2340 * If the 'vcpu' is running on 'curcpu' then it must
2341 * be sending a notification to itself (e.g. SELF_IPI).
2342 * The pending event will be picked up when the vcpu
2343 * transitions back to guest context.
2344 */
2345 }
2346 } else {
2347 KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
2348 "with hostcpu %d", vcpu->state, hostcpu));
2349 if (vcpu->state == VCPU_SLEEPING)
2350 wakeup_one(vcpu);
2351 }
2352 }
2353
2354 void
2355 vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
2356 {
2357 struct vcpu *vcpu = &vm->vcpu[vcpuid];
2358
2359 vcpu_lock(vcpu);
2360 vcpu_notify_event_locked(vcpu, lapic_intr);
2361 vcpu_unlock(vcpu);
2362 }
2363
2364 struct vmspace *
2365 vm_get_vmspace(struct vm *vm)
2366 {
2367
2368 return (vm->vmspace);
2369 }
2370
2371 int
2372 vm_apicid2vcpuid(struct vm *vm, int apicid)
2373 {
2374 /*
2375 * XXX apic id is assumed to be numerically identical to vcpu id
2376 */
2377 return (apicid);
2378 }
2379
2380 void
2381 vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
2382 vm_rendezvous_func_t func, void *arg)
2383 {
2384 int i;
2385
2386 /*
2387 * Enforce that this function is called without any locks
2388 */
2389 WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
2390 KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < VM_MAXCPU),
2391 ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
2392
2393 restart:
2394 mtx_lock(&vm->rendezvous_mtx);
2395 if (vm->rendezvous_func != NULL) {
2396 /*
2397 * If a rendezvous is already in progress then we need to
2398 * call the rendezvous handler in case this 'vcpuid' is one
2399 * of the targets of the rendezvous.
2400 */
2401 RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
2402 mtx_unlock(&vm->rendezvous_mtx);
2403 vm_handle_rendezvous(vm, vcpuid);
2404 goto restart;
2405 }
2406 KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
2407 "rendezvous is still in progress"));
2408
2409 RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
2410 vm->rendezvous_req_cpus = dest;
2411 CPU_ZERO(&vm->rendezvous_done_cpus);
2412 vm->rendezvous_arg = arg;
2413 vm_set_rendezvous_func(vm, func);
2414 mtx_unlock(&vm->rendezvous_mtx);
2415
2416 /*
2417 * Wake up any sleeping vcpus and trigger a VM-exit in any running
2418 * vcpus so they handle the rendezvous as soon as possible.
2419 */
2420 for (i = 0; i < VM_MAXCPU; i++) {
2421 if (CPU_ISSET(i, &dest))
2422 vcpu_notify_event(vm, i, false);
2423 }
2424
2425 vm_handle_rendezvous(vm, vcpuid);
2426 }
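
/*
 * Editor's illustrative sketch: a minimal callback matching
 * vm_rendezvous_func_t plus a hypothetical initiator that rendezvouses
 * every active vcpu.  Each targeted vcpu runs the callback in its own
 * context through vm_handle_rendezvous().
 */
static void
example_rendezvous_cb(struct vm *vm, int vcpuid, void *arg)
{
	VCPU_CTR0(vm, vcpuid, "example rendezvous callback");
}

static void
example_rendezvous_all(struct vm *vm, int initiator_vcpuid)
{
	vm_smp_rendezvous(vm, initiator_vcpuid, vm_active_cpus(vm),
	    example_rendezvous_cb, NULL);
}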
2427
2428 struct vatpic *
2429 vm_atpic(struct vm *vm)
2430 {
2431 return (vm->vatpic);
2432 }
2433
2434 struct vatpit *
2435 vm_atpit(struct vm *vm)
2436 {
2437 return (vm->vatpit);
2438 }
2439
2440 struct vpmtmr *
2441 vm_pmtmr(struct vm *vm)
2442 {
2443
2444 return (vm->vpmtmr);
2445 }
2446
2447 struct vrtc *
2448 vm_rtc(struct vm *vm)
2449 {
2450
2451 return (vm->vrtc);
2452 }
2453
2454 enum vm_reg_name
2455 vm_segment_name(int seg)
2456 {
2457 static enum vm_reg_name seg_names[] = {
2458 VM_REG_GUEST_ES,
2459 VM_REG_GUEST_CS,
2460 VM_REG_GUEST_SS,
2461 VM_REG_GUEST_DS,
2462 VM_REG_GUEST_FS,
2463 VM_REG_GUEST_GS
2464 };
2465
2466 KASSERT(seg >= 0 && seg < nitems(seg_names),
2467 ("%s: invalid segment encoding %d", __func__, seg));
2468 return (seg_names[seg]);
2469 }
2470
2471 void
2472 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
2473 int num_copyinfo)
2474 {
2475 int idx;
2476
2477 for (idx = 0; idx < num_copyinfo; idx++) {
2478 if (copyinfo[idx].cookie != NULL)
2479 vm_gpa_release(copyinfo[idx].cookie);
2480 }
2481 bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
2482 }
2483
2484 int
2485 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2486 uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
2487 int num_copyinfo, int *fault)
2488 {
2489 int error, idx, nused;
2490 size_t n, off, remaining;
2491 void *hva, *cookie;
2492 uint64_t gpa;
2493
2494 bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
2495
2496 nused = 0;
2497 remaining = len;
2498 while (remaining > 0) {
2499 KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
2500 error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
2501 if (error || *fault)
2502 return (error);
2503 off = gpa & PAGE_MASK;
2504 n = min(remaining, PAGE_SIZE - off);
2505 copyinfo[nused].gpa = gpa;
2506 copyinfo[nused].len = n;
2507 remaining -= n;
2508 gla += n;
2509 nused++;
2510 }
2511
2512 for (idx = 0; idx < nused; idx++) {
2513 hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
2514 copyinfo[idx].len, prot, &cookie);
2515 if (hva == NULL)
2516 break;
2517 copyinfo[idx].hva = hva;
2518 copyinfo[idx].cookie = cookie;
2519 }
2520
2521 if (idx != nused) {
2522 vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
2523 return (EFAULT);
2524 } else {
2525 *fault = 0;
2526 return (0);
2527 }
2528 }
2529
2530 void
2531 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
2532 size_t len)
2533 {
2534 char *dst;
2535 int idx;
2536
2537 dst = kaddr;
2538 idx = 0;
2539 while (len > 0) {
2540 bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
2541 len -= copyinfo[idx].len;
2542 dst += copyinfo[idx].len;
2543 idx++;
2544 }
2545 }
2546
2547 void
2548 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
2549 struct vm_copyinfo *copyinfo, size_t len)
2550 {
2551 const char *src;
2552 int idx;
2553
2554 src = kaddr;
2555 idx = 0;
2556 while (len > 0) {
2557 bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
2558 len -= copyinfo[idx].len;
2559 src += copyinfo[idx].len;
2560 idx++;
2561 }
2562 }
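
/*
 * Editor's illustrative sketch: the intended calling sequence for the
 * copy helpers above.  vm_copy_setup() translates and pins the guest
 * buffer, vm_copyin() moves the bytes, and vm_copy_teardown() drops
 * the page references.  Two vm_copyinfo entries suffice for a buffer
 * of at most PAGE_SIZE bytes, even one straddling a page boundary.
 */
static int
example_read_guest(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
    uint64_t gla, void *buf, size_t len, int *fault)
{
	struct vm_copyinfo copyinfo[2];
	int error;

	KASSERT(len <= PAGE_SIZE, ("example: buffer too large"));
	error = vm_copy_setup(vm, vcpuid, paging, gla, len, VM_PROT_READ,
	    copyinfo, nitems(copyinfo), fault);
	if (error || *fault)
		return (error);
	vm_copyin(vm, vcpuid, copyinfo, buf, len);
	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
	return (0);
}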
2563
2564 /*
2565 * Return the amount of in-use and wired memory for the VM. Since
2566 * these are global stats, only return the values for vCPU 0.
2567 */
2568 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
2569 VMM_STAT_DECLARE(VMM_MEM_WIRED);
2570
2571 static void
2572 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2573 {
2574
2575 if (vcpu == 0) {
2576 vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
2577 PAGE_SIZE * vmspace_resident_count(vm->vmspace));
2578 }
2579 }
2580
2581 static void
2582 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2583 {
2584
2585 if (vcpu == 0) {
2586 vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
2587 PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
2588 }
2589 }
2590
2591 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
2592 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);