FreeBSD/Linux Kernel Cross Reference
sys/amd64/vmm/vmm_instruction_emul.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2012 Sandvine, Inc.
    5  * Copyright (c) 2012 NetApp, Inc.
    6  * All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  *
   29  * $FreeBSD$
   30  */
   31 
   32 #include <sys/cdefs.h>
   33 __FBSDID("$FreeBSD$");
   34 
   35 #ifdef _KERNEL
   36 #include <sys/param.h>
   37 #include <sys/pcpu.h>
   38 #include <sys/systm.h>
   39 #include <sys/proc.h>
   40 
   41 #include <vm/vm.h>
   42 #include <vm/pmap.h>
   43 
   44 #include <machine/vmparam.h>
   45 #include <machine/vmm.h>
   46 #else   /* !_KERNEL */
   47 #include <sys/types.h>
   48 #include <sys/errno.h>
   49 #include <sys/_iovec.h>
   50 
   51 #include <machine/vmm.h>
   52 
   53 #include <err.h>
   54 #include <assert.h>
   55 #include <stdbool.h>
   56 #include <stddef.h>
   57 #include <stdio.h>
   58 #include <string.h>
   59 #include <strings.h>
   60 #include <vmmapi.h>
   61 #define __diagused
   62 #define KASSERT(exp,msg)        assert((exp))
   63 #define panic(...)              errx(4, __VA_ARGS__)
   64 #endif  /* _KERNEL */
   65 
   66 #include <machine/vmm_instruction_emul.h>
   67 #include <x86/psl.h>
   68 #include <x86/specialreg.h>
   69 
   70 /* struct vie_op.op_type */
   71 enum {
   72         VIE_OP_TYPE_NONE = 0,
   73         VIE_OP_TYPE_MOV,
   74         VIE_OP_TYPE_MOVSX,
   75         VIE_OP_TYPE_MOVZX,
   76         VIE_OP_TYPE_AND,
   77         VIE_OP_TYPE_OR,
   78         VIE_OP_TYPE_SUB,
   79         VIE_OP_TYPE_TWO_BYTE,
   80         VIE_OP_TYPE_PUSH,
   81         VIE_OP_TYPE_CMP,
   82         VIE_OP_TYPE_POP,
   83         VIE_OP_TYPE_MOVS,
   84         VIE_OP_TYPE_GROUP1,
   85         VIE_OP_TYPE_STOS,
   86         VIE_OP_TYPE_BITTEST,
   87         VIE_OP_TYPE_TWOB_GRP15,
   88         VIE_OP_TYPE_ADD,
   89         VIE_OP_TYPE_TEST,
   90         VIE_OP_TYPE_BEXTR,
   91         VIE_OP_TYPE_LAST
   92 };
   93 
   94 /* struct vie_op.op_flags */
   95 #define VIE_OP_F_IMM            (1 << 0)  /* 16/32-bit immediate operand */
   96 #define VIE_OP_F_IMM8           (1 << 1)  /* 8-bit immediate operand */
   97 #define VIE_OP_F_MOFFSET        (1 << 2)  /* 16/32/64-bit immediate moffset */
   98 #define VIE_OP_F_NO_MODRM       (1 << 3)
   99 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
  100 
  101 static const struct vie_op three_byte_opcodes_0f38[256] = {
  102         [0xF7] = {
  103                 .op_byte = 0xF7,
  104                 .op_type = VIE_OP_TYPE_BEXTR,
  105         },
  106 };
  107 
  108 static const struct vie_op two_byte_opcodes[256] = {
  109         [0xAE] = {
  110                 .op_byte = 0xAE,
  111                 .op_type = VIE_OP_TYPE_TWOB_GRP15,
  112         },
  113         [0xB6] = {
  114                 .op_byte = 0xB6,
  115                 .op_type = VIE_OP_TYPE_MOVZX,
  116         },
  117         [0xB7] = {
  118                 .op_byte = 0xB7,
  119                 .op_type = VIE_OP_TYPE_MOVZX,
  120         },
  121         [0xBA] = {
  122                 .op_byte = 0xBA,
  123                 .op_type = VIE_OP_TYPE_BITTEST,
  124                 .op_flags = VIE_OP_F_IMM8,
  125         },
  126         [0xBE] = {
  127                 .op_byte = 0xBE,
  128                 .op_type = VIE_OP_TYPE_MOVSX,
  129         },
  130 };
  131 
  132 static const struct vie_op one_byte_opcodes[256] = {
  133         [0x03] = {
  134                 .op_byte = 0x03,
  135                 .op_type = VIE_OP_TYPE_ADD,
  136         },
  137         [0x0F] = {
  138                 .op_byte = 0x0F,
  139                 .op_type = VIE_OP_TYPE_TWO_BYTE
  140         },
  141         [0x0B] = {
  142                 .op_byte = 0x0B,
  143                 .op_type = VIE_OP_TYPE_OR,
  144         },
  145         [0x2B] = {
  146                 .op_byte = 0x2B,
  147                 .op_type = VIE_OP_TYPE_SUB,
  148         },
  149         [0x39] = {
  150                 .op_byte = 0x39,
  151                 .op_type = VIE_OP_TYPE_CMP,
  152         },
  153         [0x3B] = {
  154                 .op_byte = 0x3B,
  155                 .op_type = VIE_OP_TYPE_CMP,
  156         },
  157         [0x88] = {
  158                 .op_byte = 0x88,
  159                 .op_type = VIE_OP_TYPE_MOV,
  160         },
  161         [0x89] = {
  162                 .op_byte = 0x89,
  163                 .op_type = VIE_OP_TYPE_MOV,
  164         },
  165         [0x8A] = {
  166                 .op_byte = 0x8A,
  167                 .op_type = VIE_OP_TYPE_MOV,
  168         },
  169         [0x8B] = {
  170                 .op_byte = 0x8B,
  171                 .op_type = VIE_OP_TYPE_MOV,
  172         },
  173         [0xA1] = {
  174                 .op_byte = 0xA1,
  175                 .op_type = VIE_OP_TYPE_MOV,
  176                 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
  177         },
  178         [0xA3] = {
  179                 .op_byte = 0xA3,
  180                 .op_type = VIE_OP_TYPE_MOV,
  181                 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
  182         },
  183         [0xA4] = {
  184                 .op_byte = 0xA4,
  185                 .op_type = VIE_OP_TYPE_MOVS,
  186                 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
  187         },
  188         [0xA5] = {
  189                 .op_byte = 0xA5,
  190                 .op_type = VIE_OP_TYPE_MOVS,
  191                 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
  192         },
  193         [0xAA] = {
  194                 .op_byte = 0xAA,
  195                 .op_type = VIE_OP_TYPE_STOS,
  196                 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
  197         },
  198         [0xAB] = {
  199                 .op_byte = 0xAB,
  200                 .op_type = VIE_OP_TYPE_STOS,
  201                 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
  202         },
  203         [0xC6] = {
  204                 /* XXX Group 11 extended opcode - not just MOV */
  205                 .op_byte = 0xC6,
  206                 .op_type = VIE_OP_TYPE_MOV,
  207                 .op_flags = VIE_OP_F_IMM8,
  208         },
  209         [0xC7] = {
  210                 .op_byte = 0xC7,
  211                 .op_type = VIE_OP_TYPE_MOV,
  212                 .op_flags = VIE_OP_F_IMM,
  213         },
  214         [0x23] = {
  215                 .op_byte = 0x23,
  216                 .op_type = VIE_OP_TYPE_AND,
  217         },
  218         [0x80] = {
  219                 /* Group 1 extended opcode */
  220                 .op_byte = 0x80,
  221                 .op_type = VIE_OP_TYPE_GROUP1,
  222                 .op_flags = VIE_OP_F_IMM8,
  223         },
  224         [0x81] = {
  225                 /* Group 1 extended opcode */
  226                 .op_byte = 0x81,
  227                 .op_type = VIE_OP_TYPE_GROUP1,
  228                 .op_flags = VIE_OP_F_IMM,
  229         },
  230         [0x83] = {
  231                 /* Group 1 extended opcode */
  232                 .op_byte = 0x83,
  233                 .op_type = VIE_OP_TYPE_GROUP1,
  234                 .op_flags = VIE_OP_F_IMM8,
  235         },
  236         [0x8F] = {
  237                 /* XXX Group 1A extended opcode - not just POP */
  238                 .op_byte = 0x8F,
  239                 .op_type = VIE_OP_TYPE_POP,
  240         },
  241         [0xF7] = {
  242                 /* XXX Group 3 extended opcode - not just TEST */
  243                 .op_byte = 0xF7,
  244                 .op_type = VIE_OP_TYPE_TEST,
  245                 .op_flags = VIE_OP_F_IMM,
  246         },
  247         [0xFF] = {
  248                 /* XXX Group 5 extended opcode - not just PUSH */
  249                 .op_byte = 0xFF,
  250                 .op_type = VIE_OP_TYPE_PUSH,
  251         }
  252 };
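/*
 * Illustrative note (added, not in the original source): the decoder indexes
 * these sparse tables directly by the fetched opcode byte, e.g.
 * one_byte_opcodes[0x8B] yields the MOV load form.  Entries that are not
 * listed are zero-initialized, so their op_type is VIE_OP_TYPE_NONE and the
 * decoder can treat such opcodes as unsupported.
 */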
  253 
  254 /* struct vie.mod */
  255 #define VIE_MOD_INDIRECT                0
  256 #define VIE_MOD_INDIRECT_DISP8          1
  257 #define VIE_MOD_INDIRECT_DISP32         2
  258 #define VIE_MOD_DIRECT                  3
  259 
  260 /* struct vie.rm */
  261 #define VIE_RM_SIB                      4
  262 #define VIE_RM_DISP32                   5
  263 
  264 #define GB                              (1024 * 1024 * 1024)
  265 
  266 static enum vm_reg_name gpr_map[16] = {
  267         VM_REG_GUEST_RAX,
  268         VM_REG_GUEST_RCX,
  269         VM_REG_GUEST_RDX,
  270         VM_REG_GUEST_RBX,
  271         VM_REG_GUEST_RSP,
  272         VM_REG_GUEST_RBP,
  273         VM_REG_GUEST_RSI,
  274         VM_REG_GUEST_RDI,
  275         VM_REG_GUEST_R8,
  276         VM_REG_GUEST_R9,
  277         VM_REG_GUEST_R10,
  278         VM_REG_GUEST_R11,
  279         VM_REG_GUEST_R12,
  280         VM_REG_GUEST_R13,
  281         VM_REG_GUEST_R14,
  282         VM_REG_GUEST_R15
  283 };
  284 
  285 static uint64_t size2mask[] = {
  286         [1] = 0xff,
  287         [2] = 0xffff,
  288         [4] = 0xffffffff,
  289         [8] = 0xffffffffffffffff,
  290 };
  291 
  292 static int
  293 vie_read_register(VCPU_DECL, enum vm_reg_name reg, uint64_t *rval)
  294 {
  295         int error;
  296 
  297         error = vm_get_register(VCPU_ARGS, reg, rval);
  298 
  299         return (error);
  300 }
  301 
  302 static void
  303 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
  304 {
  305         *lhbr = 0;
  306         *reg = gpr_map[vie->reg];
  307 
  308         /*
  309          * 64-bit mode imposes limitations on accessing legacy high byte
  310          * registers (lhbr).
  311          *
  312          * The legacy high-byte registers cannot be addressed if the REX
  313          * prefix is present. In this case the values 4, 5, 6 and 7 of the
  314          * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
  315          *
  316          * If the REX prefix is not present then the values 4, 5, 6 and 7
  317          * of the 'ModRM:reg' field address the legacy high-byte registers,
  318          * %ah, %ch, %dh and %bh respectively.
  319          */
  320         if (!vie->rex_present) {
  321                 if (vie->reg & 0x4) {
  322                         *lhbr = 1;
  323                         *reg = gpr_map[vie->reg & 0x3];
  324                 }
  325         }
  326 }
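/*
 * Worked example (added for illustration): with ModRM:reg = 5 and no REX
 * prefix, vie_calc_bytereg() returns gpr_map[5 & 3] = VM_REG_GUEST_RCX with
 * lhbr = 1, i.e. the byte operand is %ch (bits 8-15 of %rcx).  With a REX
 * prefix present, the same encoding selects gpr_map[5] = VM_REG_GUEST_RBP,
 * i.e. %bpl (bits 0-7 of %rbp).
 */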
  327 
  328 static int
  329 vie_read_bytereg(VCPU_DECL, struct vie *vie, uint8_t *rval)
  330 {
  331         uint64_t val;
  332         int error, lhbr;
  333         enum vm_reg_name reg;
  334 
  335         vie_calc_bytereg(vie, &reg, &lhbr);
  336         error = vm_get_register(VCPU_ARGS, reg, &val);
  337 
  338         /*
  339          * To obtain the value of a legacy high byte register shift the
  340          * base register right by 8 bits (%ah = %rax >> 8).
  341          */
  342         if (lhbr)
  343                 *rval = val >> 8;
  344         else
  345                 *rval = val;
  346         return (error);
  347 }
  348 
  349 static int
  350 vie_write_bytereg(VCPU_DECL, struct vie *vie, uint8_t byte)
  351 {
  352         uint64_t origval, val, mask;
  353         int error, lhbr;
  354         enum vm_reg_name reg;
  355 
  356         vie_calc_bytereg(vie, &reg, &lhbr);
  357         error = vm_get_register(VCPU_ARGS, reg, &origval);
  358         if (error == 0) {
  359                 val = byte;
  360                 mask = 0xff;
  361                 if (lhbr) {
  362                         /*
  363                          * Shift left by 8 to store 'byte' in a legacy high
  364                          * byte register.
  365                          */
  366                         val <<= 8;
  367                         mask <<= 8;
  368                 }
  369                 val |= origval & ~mask;
  370                 error = vm_set_register(VCPU_ARGS, reg, val);
  371         }
  372         return (error);
  373 }
  374 
  375 int
  376 vie_update_register(VCPU_DECL, enum vm_reg_name reg,
  377                     uint64_t val, int size)
  378 {
  379         int error;
  380         uint64_t origval;
  381 
  382         switch (size) {
  383         case 1:
  384         case 2:
  385                 error = vie_read_register(VCPU_ARGS, reg, &origval);
  386                 if (error)
  387                         return (error);
  388                 val &= size2mask[size];
  389                 val |= origval & ~size2mask[size];
  390                 break;
  391         case 4:
  392                 val &= 0xffffffffUL;
  393                 break;
  394         case 8:
  395                 break;
  396         default:
  397                 return (EINVAL);
  398         }
  399 
  400         error = vm_set_register(VCPU_ARGS, reg, val);
  401         return (error);
  402 }
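/*
 * Worked example (added for illustration): writing 0x1234 with size 2 to
 * %rax merges with the old value and preserves bits 16-63, while writing
 * 0x1234 with size 4 clears bits 32-63, matching the x86-64 rule that
 * 32-bit destinations zero-extend and 8/16-bit destinations do not.
 */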
  403 
  404 #define RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
  405 
  406 /*
  407  * Return the status flags that would result from doing (x - y).
  408  */
  409 #define GETCC(sz)                                                       \
  410 static u_long                                                           \
  411 getcc##sz(uint##sz##_t x, uint##sz##_t y)                               \
  412 {                                                                       \
  413         u_long rflags;                                                  \
  414                                                                         \
  415         __asm __volatile("sub %2,%1; pushfq; popq %0" :                 \
  416             "=r" (rflags), "+r" (x) : "m" (y));                         \
  417         return (rflags);                                                \
  418 } struct __hack
  419 
  420 GETCC(8);
  421 GETCC(16);
  422 GETCC(32);
  423 GETCC(64);
  424 
  425 static u_long
  426 getcc(int opsize, uint64_t x, uint64_t y)
  427 {
  428         KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
  429             ("getcc: invalid operand size %d", opsize));
  430 
  431         if (opsize == 1)
  432                 return (getcc8(x, y));
  433         else if (opsize == 2)
  434                 return (getcc16(x, y));
  435         else if (opsize == 4)
  436                 return (getcc32(x, y));
  437         else
  438                 return (getcc64(x, y));
  439 }
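/*
 * Worked example (added for illustration): getcc(1, 0x10, 0x20) executes
 * "sub" on the 8-bit operands, so 0x10 - 0x20 = 0xf0 with a borrow; the
 * returned rflags therefore have PSL_C, PSL_N and PSL_PF set and PSL_Z and
 * PSL_V clear.  emulate_cmp() masks this down to RFLAGS_STATUS_BITS before
 * merging it into the guest's %rflags.
 */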
  440 
  441 /*
  442  * Macro creation of functions getaddflags{8,16,32,64}
  443  */
  444 #define GETADDFLAGS(sz)                                                 \
  445 static u_long                                                           \
  446 getaddflags##sz(uint##sz##_t x, uint##sz##_t y)                         \
  447 {                                                                       \
  448         u_long rflags;                                                  \
  449                                                                         \
  450         __asm __volatile("add %2,%1; pushfq; popq %0" :                 \
  451             "=r" (rflags), "+r" (x) : "m" (y));                         \
  452         return (rflags);                                                \
  453 } struct __hack
  454 
  455 GETADDFLAGS(8);
  456 GETADDFLAGS(16);
  457 GETADDFLAGS(32);
  458 GETADDFLAGS(64);
  459 
  460 static u_long
  461 getaddflags(int opsize, uint64_t x, uint64_t y)
  462 {
  463         KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
  464             ("getaddflags: invalid operand size %d", opsize));
  465 
  466         if (opsize == 1)
  467                 return (getaddflags8(x, y));
  468         else if (opsize == 2)
  469                 return (getaddflags16(x, y));
  470         else if (opsize == 4)
  471                 return (getaddflags32(x, y));
  472         else
  473                 return (getaddflags64(x, y));
  474 }
  475 
  476 /*
  477  * Return the status flags that would result from doing (x & y).
  478  */
  479 #define GETANDFLAGS(sz)                                                 \
  480 static u_long                                                           \
  481 getandflags##sz(uint##sz##_t x, uint##sz##_t y)                         \
  482 {                                                                       \
  483         u_long rflags;                                                  \
  484                                                                         \
  485         __asm __volatile("and %2,%1; pushfq; popq %0" :                 \
  486             "=r" (rflags), "+r" (x) : "m" (y));                         \
  487         return (rflags);                                                \
  488 } struct __hack
  489 
  490 GETANDFLAGS(8);
  491 GETANDFLAGS(16);
  492 GETANDFLAGS(32);
  493 GETANDFLAGS(64);
  494 
  495 static u_long
  496 getandflags(int opsize, uint64_t x, uint64_t y)
  497 {
  498         KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
  499             ("getandflags: invalid operand size %d", opsize));
  500 
  501         if (opsize == 1)
  502                 return (getandflags8(x, y));
  503         else if (opsize == 2)
  504                 return (getandflags16(x, y));
  505         else if (opsize == 4)
  506                 return (getandflags32(x, y));
  507         else
  508                 return (getandflags64(x, y));
  509 }
  510 
  511 static int
  512 emulate_mov(VCPU_DECL, uint64_t gpa, struct vie *vie,
  513             mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
  514 {
  515         int error, size;
  516         enum vm_reg_name reg;
  517         uint8_t byte;
  518         uint64_t val;
  519 
  520         size = vie->opsize;
  521         error = EINVAL;
  522 
  523         switch (vie->op.op_byte) {
  524         case 0x88:
  525                 /*
  526                  * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
  527                  * 88/r:        mov r/m8, r8
  528                  * REX + 88/r:  mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
  529                  */
  530                 size = 1;       /* override for byte operation */
  531                 error = vie_read_bytereg(VCPU_ARGS, vie, &byte);
  532                 if (error == 0)
  533                         error = memwrite(VCPU_ARGS, gpa, byte, size, arg);
  534                 break;
  535         case 0x89:
  536                 /*
  537                  * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
  538                  * 89/r:        mov r/m16, r16
  539                  * 89/r:        mov r/m32, r32
  540                  * REX.W + 89/r mov r/m64, r64
  541                  */
  542                 reg = gpr_map[vie->reg];
  543                 error = vie_read_register(VCPU_ARGS, reg, &val);
  544                 if (error == 0) {
  545                         val &= size2mask[size];
  546                         error = memwrite(VCPU_ARGS, gpa, val, size, arg);
  547                 }
  548                 break;
  549         case 0x8A:
  550                 /*
  551                  * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
  552                  * 8A/r:        mov r8, r/m8
  553                  * REX + 8A/r:  mov r8, r/m8
  554                  */
  555                 size = 1;       /* override for byte operation */
  556                 error = memread(VCPU_ARGS, gpa, &val, size, arg);
  557                 if (error == 0)
  558                         error = vie_write_bytereg(VCPU_ARGS, vie, val);
  559                 break;
  560         case 0x8B:
  561                 /*
  562                  * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
  563                  * 8B/r:        mov r16, r/m16
  564                  * 8B/r:        mov r32, r/m32
  565                  * REX.W 8B/r:  mov r64, r/m64
  566                  */
  567                 error = memread(VCPU_ARGS, gpa, &val, size, arg);
  568                 if (error == 0) {
  569                         reg = gpr_map[vie->reg];
  570                         error = vie_update_register(VCPU_ARGS, reg, val, size);
  571                 }
  572                 break;
  573         case 0xA1:
  574                 /*
  575                  * MOV from seg:moffset to AX/EAX/RAX
  576                  * A1:          mov AX, moffs16
  577                  * A1:          mov EAX, moffs32
  578                  * REX.W + A1:  mov RAX, moffs64
  579                  */
  580                 error = memread(VCPU_ARGS, gpa, &val, size, arg);
  581                 if (error == 0) {
  582                         reg = VM_REG_GUEST_RAX;
  583                         error = vie_update_register(VCPU_ARGS, reg, val, size);
  584                 }
  585                 break;
  586         case 0xA3:
  587                 /*
  588                  * MOV from AX/EAX/RAX to seg:moffset
  589                  * A3:          mov moffs16, AX
  590                  * A3:          mov moffs32, EAX 
  591                  * REX.W + A3:  mov moffs64, RAX
  592                  */
  593                 error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RAX, &val);
  594                 if (error == 0) {
  595                         val &= size2mask[size];
  596                         error = memwrite(VCPU_ARGS, gpa, val, size, arg);
  597                 }
  598                 break;
  599         case 0xC6:
  600                 /*
  601                  * MOV from imm8 to mem (ModRM:r/m)
  602                  * C6/0         mov r/m8, imm8
  603                  * REX + C6/0   mov r/m8, imm8
  604                  */
  605                 size = 1;       /* override for byte operation */
  606                 error = memwrite(VCPU_ARGS, gpa, vie->immediate, size, arg);
  607                 break;
  608         case 0xC7:
  609                 /*
  610                  * MOV from imm16/imm32 to mem (ModRM:r/m)
  611                  * C7/0         mov r/m16, imm16
  612                  * C7/0         mov r/m32, imm32
  613                  * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
  614                  */
  615                 val = vie->immediate & size2mask[size];
  616                 error = memwrite(VCPU_ARGS, gpa, val, size, arg);
  617                 break;
  618         default:
  619                 break;
  620         }
  621 
  622         return (error);
  623 }
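/*
 * Usage sketch (added for illustration): a guest "mov %eax,(%rbx)" that
 * faults on an MMIO address decodes to op_byte 0x89 with vie->opsize = 4.
 * emulate_mov() then reads the guest %rax through vie_read_register(),
 * masks it with size2mask[4] and passes the low 32 bits to the caller's
 * memwrite() callback for the faulting guest physical address.
 */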
  624 
  625 static int
  626 emulate_movx(VCPU_DECL, uint64_t gpa, struct vie *vie,
  627     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
  628 {
  629         int error, size;
  630         enum vm_reg_name reg;
  631         uint64_t val;
  632 
  633         size = vie->opsize;
  634         error = EINVAL;
  635 
  636         switch (vie->op.op_byte) {
  637         case 0xB6:
  638                 /*
  639                  * MOV and zero extend byte from mem (ModRM:r/m) to
  640                  * reg (ModRM:reg).
  641                  *
  642                  * 0F B6/r              movzx r16, r/m8
  643                  * 0F B6/r              movzx r32, r/m8
  644                  * REX.W + 0F B6/r      movzx r64, r/m8
  645                  */
  646 
  647                 /* get the first operand */
  648                 error = memread(VCPU_ARGS, gpa, &val, 1, arg);
  649                 if (error)
  650                         break;
  651 
  652                 /* get the second operand */
  653                 reg = gpr_map[vie->reg];
  654 
  655                 /* zero-extend byte */
  656                 val = (uint8_t)val;
  657 
  658                 /* write the result */
  659                 error = vie_update_register(VCPU_ARGS, reg, val, size);
  660                 break;
  661         case 0xB7:
  662                 /*
  663                  * MOV and zero extend word from mem (ModRM:r/m) to
  664                  * reg (ModRM:reg).
  665                  *
  666                  * 0F B7/r              movzx r32, r/m16
  667                  * REX.W + 0F B7/r      movzx r64, r/m16
  668                  */
  669                 error = memread(VCPU_ARGS, gpa, &val, 2, arg);
  670                 if (error)
  671                         return (error);
  672 
  673                 reg = gpr_map[vie->reg];
  674 
  675                 /* zero-extend word */
  676                 val = (uint16_t)val;
  677 
  678                 error = vie_update_register(VCPU_ARGS, reg, val, size);
  679                 break;
  680         case 0xBE:
  681                 /*
  682                  * MOV and sign extend byte from mem (ModRM:r/m) to
  683                  * reg (ModRM:reg).
  684                  *
  685                  * 0F BE/r              movsx r16, r/m8
  686                  * 0F BE/r              movsx r32, r/m8
  687                  * REX.W + 0F BE/r      movsx r64, r/m8
  688                  */
  689 
  690                 /* get the first operand */
  691                 error = memread(VCPU_ARGS, gpa, &val, 1, arg);
  692                 if (error)
  693                         break;
  694 
  695                 /* get the second operand */
  696                 reg = gpr_map[vie->reg];
  697 
  698                 /* sign extend byte */
  699                 val = (int8_t)val;
  700 
  701                 /* write the result */
  702                 error = vie_update_register(VCPU_ARGS, reg, val, size);
  703                 break;
  704         default:
  705                 break;
  706         }
  707         return (error);
  708 }
  709 
  710 /*
  711  * Helper function to calculate and validate a linear address.
  712  */
  713 static int
  714 get_gla(VCPU_DECL, struct vie *vie __unused,
  715     struct vm_guest_paging *paging, int opsize, int addrsize, int prot,
  716     enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla, int *fault)
  717 {
  718         struct seg_desc desc;
  719         uint64_t cr0, val, rflags;
  720         int error __diagused;
  721 
  722         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_CR0, &cr0);
  723         KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
  724 
  725         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
  726         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
  727 
  728         error = vm_get_seg_desc(VCPU_ARGS, seg, &desc);
  729         KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
  730             __func__, error, seg));
  731 
  732         error = vie_read_register(VCPU_ARGS, gpr, &val);
  733         KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
  734             error, gpr));
  735 
  736         if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
  737             addrsize, prot, gla)) {
  738                 if (seg == VM_REG_GUEST_SS)
  739                         vm_inject_ss(VCPU_ARGS, 0);
  740                 else
  741                         vm_inject_gp(VCPU_ARGS);
  742                 goto guest_fault;
  743         }
  744 
  745         if (vie_canonical_check(paging->cpu_mode, *gla)) {
  746                 if (seg == VM_REG_GUEST_SS)
  747                         vm_inject_ss(VCPU_ARGS, 0);
  748                 else
  749                         vm_inject_gp(VCPU_ARGS);
  750                 goto guest_fault;
  751         }
  752 
  753         if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
  754                 vm_inject_ac(VCPU_ARGS, 0);
  755                 goto guest_fault;
  756         }
  757 
  758         *fault = 0;
  759         return (0);
  760 
  761 guest_fault:
  762         *fault = 1;
  763         return (0);
  764 }
  765 
  766 static int
  767 emulate_movs(VCPU_DECL, uint64_t gpa, struct vie *vie,
  768     struct vm_guest_paging *paging, mem_region_read_t memread,
  769     mem_region_write_t memwrite, void *arg)
  770 {
  771 #ifdef _KERNEL
  772         struct vm_copyinfo copyinfo[2];
  773 #else
  774         struct iovec copyinfo[2];
  775 #endif
  776         uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
  777         uint64_t rcx, rdi, rsi, rflags;
  778         int error, fault, opsize, seg, repeat;
  779 
  780         opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
  781         val = 0;
  782         error = 0;
  783 
  784         /*
  785          * XXX although the MOVS instruction is only supposed to be used with
   786          * the "rep" prefix, some guests like FreeBSD will use "repnz" instead.
  787          *
  788          * Empirically the "repnz" prefix has identical behavior to "rep"
  789          * and the zero flag does not make a difference.
  790          */
  791         repeat = vie->repz_present | vie->repnz_present;
  792 
  793         if (repeat) {
  794                 error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RCX, &rcx);
  795                 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
  796 
  797                 /*
  798                  * The count register is %rcx, %ecx or %cx depending on the
  799                  * address size of the instruction.
  800                  */
  801                 if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
  802                         error = 0;
  803                         goto done;
  804                 }
  805         }
  806 
  807         /*
  808          *      Source          Destination     Comments
  809          *      --------------------------------------------
  810          * (1)  memory          memory          n/a
  811          * (2)  memory          mmio            emulated
  812          * (3)  mmio            memory          emulated
  813          * (4)  mmio            mmio            emulated
  814          *
  815          * At this point we don't have sufficient information to distinguish
  816          * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
  817          * out because it will succeed only when operating on regular memory.
  818          *
  819          * XXX the emulation doesn't properly handle the case where 'gpa'
  820          * is straddling the boundary between the normal memory and MMIO.
  821          */
  822 
  823         seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
  824         error = get_gla(VCPU_ARGS, vie, paging, opsize, vie->addrsize,
  825             PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
  826         if (error || fault)
  827                 goto done;
  828 
  829         error = vm_copy_setup(VCPU_ARGS, paging, srcaddr, opsize, PROT_READ,
  830             copyinfo, nitems(copyinfo), &fault);
  831         if (error == 0) {
  832                 if (fault)
  833                         goto done;      /* Resume guest to handle fault */
  834 
  835                 /*
  836                  * case (2): read from system memory and write to mmio.
  837                  */
  838                 vm_copyin(copyinfo, &val, opsize);
  839                 vm_copy_teardown(copyinfo, nitems(copyinfo));
  840                 error = memwrite(VCPU_ARGS, gpa, val, opsize, arg);
  841                 if (error)
  842                         goto done;
  843         } else {
  844                 /*
  845                  * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
  846                  * if 'srcaddr' is in the mmio space.
  847                  */
  848 
  849                 error = get_gla(VCPU_ARGS, vie, paging, opsize, vie->addrsize,
  850                     PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
  851                     &fault);
  852                 if (error || fault)
  853                         goto done;
  854 
  855                 error = vm_copy_setup(VCPU_ARGS, paging, dstaddr, opsize,
  856                     PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
  857                 if (error == 0) {
  858                         if (fault)
  859                                 goto done;    /* Resume guest to handle fault */
  860 
  861                         /*
  862                          * case (3): read from MMIO and write to system memory.
  863                          *
  864                          * A MMIO read can have side-effects so we
  865                          * commit to it only after vm_copy_setup() is
  866                          * successful. If a page-fault needs to be
  867                          * injected into the guest then it will happen
  868                          * before the MMIO read is attempted.
  869                          */
  870                         error = memread(VCPU_ARGS, gpa, &val, opsize, arg);
  871                         if (error)
  872                                 goto done;
  873 
  874                         vm_copyout(&val, copyinfo, opsize);
  875                         vm_copy_teardown(copyinfo, nitems(copyinfo));
  876                 } else {
  877                         /*
  878                          * Case (4): read from and write to mmio.
  879                          *
  880                          * Commit to the MMIO read/write (with potential
  881                          * side-effects) only after we are sure that the
  882                          * instruction is not going to be restarted due
  883                          * to address translation faults.
  884                          */
  885                         error = vm_gla2gpa(VCPU_ARGS, paging, srcaddr,
  886                             PROT_READ, &srcgpa, &fault);
  887                         if (error || fault)
  888                                 goto done;
  889 
  890                         error = vm_gla2gpa(VCPU_ARGS, paging, dstaddr,
  891                            PROT_WRITE, &dstgpa, &fault);
  892                         if (error || fault)
  893                                 goto done;
  894 
  895                         error = memread(VCPU_ARGS, srcgpa, &val, opsize, arg);
  896                         if (error)
  897                                 goto done;
  898 
  899                         error = memwrite(VCPU_ARGS, dstgpa, val, opsize, arg);
  900                         if (error)
  901                                 goto done;
  902                 }
  903         }
  904 
  905         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RSI, &rsi);
  906         KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
  907 
  908         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RDI, &rdi);
  909         KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
  910 
  911         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
  912         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
  913 
  914         if (rflags & PSL_D) {
  915                 rsi -= opsize;
  916                 rdi -= opsize;
  917         } else {
  918                 rsi += opsize;
  919                 rdi += opsize;
  920         }
  921 
  922         error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RSI, rsi,
  923             vie->addrsize);
  924         KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
  925 
  926         error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RDI, rdi,
  927             vie->addrsize);
  928         KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
  929 
  930         if (repeat) {
  931                 rcx = rcx - 1;
  932                 error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RCX,
  933                     rcx, vie->addrsize);
  934                 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
  935 
  936                 /*
  937                  * Repeat the instruction if the count register is not zero.
  938                  */
  939                 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
  940                         vm_restart_instruction(VCPU_ARGS);
  941         }
  942 done:
  943         KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
  944             __func__, error));
  945         return (error);
  946 }
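/*
 * Worked example (added for illustration): for "rep movsl" with %rcx = 2,
 * opsize 4 and PSL_D clear, each pass copies four bytes, advances %rsi and
 * %rdi by four, decrements %rcx and calls vm_restart_instruction(), so the
 * guest re-executes the MOVS until the count register reaches zero.
 */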
  947 
  948 static int
  949 emulate_stos(VCPU_DECL, uint64_t gpa, struct vie *vie,
  950     struct vm_guest_paging *paging __unused, mem_region_read_t memread __unused,
  951     mem_region_write_t memwrite, void *arg)
  952 {
  953         int error, opsize, repeat;
  954         uint64_t val;
  955         uint64_t rcx, rdi, rflags;
  956 
  957         opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
  958         repeat = vie->repz_present | vie->repnz_present;
  959 
  960         if (repeat) {
  961                 error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RCX, &rcx);
  962                 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
  963 
  964                 /*
  965                  * The count register is %rcx, %ecx or %cx depending on the
  966                  * address size of the instruction.
  967                  */
  968                 if ((rcx & vie_size2mask(vie->addrsize)) == 0)
  969                         return (0);
  970         }
  971 
  972         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RAX, &val);
  973         KASSERT(!error, ("%s: error %d getting rax", __func__, error));
  974 
  975         error = memwrite(VCPU_ARGS, gpa, val, opsize, arg);
  976         if (error)
  977                 return (error);
  978 
  979         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RDI, &rdi);
  980         KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
  981 
  982         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
  983         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
  984 
  985         if (rflags & PSL_D)
  986                 rdi -= opsize;
  987         else
  988                 rdi += opsize;
  989 
  990         error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RDI, rdi,
  991             vie->addrsize);
  992         KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
  993 
  994         if (repeat) {
  995                 rcx = rcx - 1;
  996                 error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RCX,
  997                     rcx, vie->addrsize);
  998                 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
  999 
 1000                 /*
 1001                  * Repeat the instruction if the count register is not zero.
 1002                  */
 1003                 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
 1004                         vm_restart_instruction(VCPU_ARGS);
 1005         }
 1006 
 1007         return (0);
 1008 }
 1009 
 1010 static int
 1011 emulate_and(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1012             mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 1013 {
 1014         int error, size;
 1015         enum vm_reg_name reg;
 1016         uint64_t result, rflags, rflags2, val1, val2;
 1017 
 1018         size = vie->opsize;
 1019         error = EINVAL;
 1020 
 1021         switch (vie->op.op_byte) {
 1022         case 0x23:
 1023                 /*
 1024                  * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
 1025                  * result in reg.
 1026                  *
 1027                  * 23/r         and r16, r/m16
 1028                  * 23/r         and r32, r/m32
 1029                  * REX.W + 23/r and r64, r/m64
 1030                  */
 1031 
 1032                 /* get the first operand */
 1033                 reg = gpr_map[vie->reg];
 1034                 error = vie_read_register(VCPU_ARGS, reg, &val1);
 1035                 if (error)
 1036                         break;
 1037 
 1038                 /* get the second operand */
 1039                 error = memread(VCPU_ARGS, gpa, &val2, size, arg);
 1040                 if (error)
 1041                         break;
 1042 
 1043                 /* perform the operation and write the result */
 1044                 result = val1 & val2;
 1045                 error = vie_update_register(VCPU_ARGS, reg, result, size);
 1046                 break;
 1047         case 0x81:
 1048         case 0x83:
 1049                 /*
 1050                  * AND mem (ModRM:r/m) with immediate and store the
 1051                  * result in mem.
 1052                  *
 1053                  * 81 /4                and r/m16, imm16
 1054                  * 81 /4                and r/m32, imm32
 1055                  * REX.W + 81 /4        and r/m64, imm32 sign-extended to 64
 1056                  *
 1057                  * 83 /4                and r/m16, imm8 sign-extended to 16
 1058                  * 83 /4                and r/m32, imm8 sign-extended to 32
 1059                  * REX.W + 83/4         and r/m64, imm8 sign-extended to 64
 1060                  */
 1061 
 1062                 /* get the first operand */
 1063                 error = memread(VCPU_ARGS, gpa, &val1, size, arg);
 1064                 if (error)
 1065                         break;
 1066 
 1067                 /*
 1068                  * perform the operation with the pre-fetched immediate
 1069                  * operand and write the result
 1070                  */
 1071                 result = val1 & vie->immediate;
 1072                 error = memwrite(VCPU_ARGS, gpa, result, size, arg);
 1073                 break;
 1074         default:
 1075                 break;
 1076         }
 1077         if (error)
 1078                 return (error);
 1079 
 1080         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
 1081         if (error)
 1082                 return (error);
 1083 
 1084         /*
 1085          * OF and CF are cleared; the SF, ZF and PF flags are set according
 1086          * to the result; AF is undefined.
 1087          *
 1088          * The updated status flags are obtained by subtracting 0 from 'result'.
 1089          */
 1090         rflags2 = getcc(size, result, 0);
 1091         rflags &= ~RFLAGS_STATUS_BITS;
 1092         rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
 1093 
 1094         error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8);
 1095         return (error);
 1096 }
 1097 
 1098 static int
 1099 emulate_or(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1100             mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 1101 {
 1102         int error, size;
 1103         enum vm_reg_name reg;
 1104         uint64_t result, rflags, rflags2, val1, val2;
 1105 
 1106         size = vie->opsize;
 1107         error = EINVAL;
 1108 
 1109         switch (vie->op.op_byte) {
 1110         case 0x0B:
 1111                 /*
 1112                  * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the
 1113                  * result in reg.
 1114                  *
 1115                  * 0b/r         or r16, r/m16
 1116                  * 0b/r         or r32, r/m32
 1117                  * REX.W + 0b/r or r64, r/m64
 1118                  */
 1119 
 1120                 /* get the first operand */
 1121                 reg = gpr_map[vie->reg];
 1122                 error = vie_read_register(VCPU_ARGS, reg, &val1);
 1123                 if (error)
 1124                         break;
 1125                 
 1126                 /* get the second operand */
 1127                 error = memread(VCPU_ARGS, gpa, &val2, size, arg);
 1128                 if (error)
 1129                         break;
 1130 
 1131                 /* perform the operation and write the result */
 1132                 result = val1 | val2;
 1133                 error = vie_update_register(VCPU_ARGS, reg, result, size);
 1134                 break;
 1135         case 0x81:
 1136         case 0x83:
 1137                 /*
 1138                  * OR mem (ModRM:r/m) with immediate and store the
 1139                  * result in mem.
 1140                  *
 1141                  * 81 /1                or r/m16, imm16
 1142                  * 81 /1                or r/m32, imm32
 1143                  * REX.W + 81 /1        or r/m64, imm32 sign-extended to 64
 1144                  *
 1145                  * 83 /1                or r/m16, imm8 sign-extended to 16
 1146                  * 83 /1                or r/m32, imm8 sign-extended to 32
 1147                  * REX.W + 83/1         or r/m64, imm8 sign-extended to 64
 1148                  */
 1149 
 1150                 /* get the first operand */
 1151                 error = memread(VCPU_ARGS, gpa, &val1, size, arg);
 1152                 if (error)
 1153                         break;
 1154 
 1155                 /*
 1156                  * perform the operation with the pre-fetched immediate
 1157                  * operand and write the result
 1158                  */
 1159                 result = val1 | vie->immediate;
 1160                 error = memwrite(VCPU_ARGS, gpa, result, size, arg);
 1161                 break;
 1162         default:
 1163                 break;
 1164         }
 1165         if (error)
 1166                 return (error);
 1167 
 1168         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
 1169         if (error)
 1170                 return (error);
 1171 
 1172         /*
 1173          * OF and CF are cleared; the SF, ZF and PF flags are set according
 1174          * to the result; AF is undefined.
 1175          *
 1176          * The updated status flags are obtained by subtracting 0 from 'result'.
 1177          */
 1178         rflags2 = getcc(size, result, 0);
 1179         rflags &= ~RFLAGS_STATUS_BITS;
 1180         rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
 1181 
 1182         error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8);
 1183         return (error);
 1184 }
 1185 
 1186 static int
 1187 emulate_cmp(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1188     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
 1189 {
 1190         int error, size;
 1191         uint64_t regop, memop, op1, op2, rflags, rflags2;
 1192         enum vm_reg_name reg;
 1193 
 1194         size = vie->opsize;
 1195         switch (vie->op.op_byte) {
 1196         case 0x39:
 1197         case 0x3B:
 1198                 /*
 1199                  * 39/r         CMP r/m16, r16
 1200                  * 39/r         CMP r/m32, r32
 1201                  * REX.W 39/r   CMP r/m64, r64
 1202                  *
 1203                  * 3B/r         CMP r16, r/m16
 1204                  * 3B/r         CMP r32, r/m32
 1205                  * REX.W + 3B/r CMP r64, r/m64
 1206                  *
 1207                  * Compare the first operand with the second operand and
 1208                  * set status flags in EFLAGS register. The comparison is
 1209                  * performed by subtracting the second operand from the first
 1210                  * operand and then setting the status flags.
 1211                  */
 1212 
 1213                 /* Get the register operand */
 1214                 reg = gpr_map[vie->reg];
 1215                 error = vie_read_register(VCPU_ARGS, reg, &regop);
 1216                 if (error)
 1217                         return (error);
 1218 
 1219                 /* Get the memory operand */
 1220                 error = memread(VCPU_ARGS, gpa, &memop, size, arg);
 1221                 if (error)
 1222                         return (error);
 1223 
 1224                 if (vie->op.op_byte == 0x3B) {
 1225                         op1 = regop;
 1226                         op2 = memop;
 1227                 } else {
 1228                         op1 = memop;
 1229                         op2 = regop;
 1230                 }
 1231                 rflags2 = getcc(size, op1, op2);
 1232                 break;
 1233         case 0x80:
 1234         case 0x81:
 1235         case 0x83:
 1236                 /*
 1237                  * 80 /7                cmp r/m8, imm8
 1238                  * REX + 80 /7          cmp r/m8, imm8
 1239                  *
 1240                  * 81 /7                cmp r/m16, imm16
 1241                  * 81 /7                cmp r/m32, imm32
 1242                  * REX.W + 81 /7        cmp r/m64, imm32 sign-extended to 64
 1243                  *
 1244                  * 83 /7                cmp r/m16, imm8 sign-extended to 16
 1245                  * 83 /7                cmp r/m32, imm8 sign-extended to 32
 1246                  * REX.W + 83 /7        cmp r/m64, imm8 sign-extended to 64
 1247                  *
 1248                  * Compare mem (ModRM:r/m) with immediate and set
 1249                  * status flags according to the results.  The
 1250                  * comparison is performed by subtracting the
 1251                  * immediate from the first operand and then setting
 1252                  * the status flags.
 1253                  *
 1254                  */
 1255                 if (vie->op.op_byte == 0x80)
 1256                         size = 1;
 1257 
 1258                 /* get the first operand */
 1259                 error = memread(VCPU_ARGS, gpa, &op1, size, arg);
 1260                 if (error)
 1261                         return (error);
 1262 
 1263                 rflags2 = getcc(size, op1, vie->immediate);
 1264                 break;
 1265         default:
 1266                 return (EINVAL);
 1267         }
 1268         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
 1269         if (error)
 1270                 return (error);
 1271         rflags &= ~RFLAGS_STATUS_BITS;
 1272         rflags |= rflags2 & RFLAGS_STATUS_BITS;
 1273 
 1274         error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8);
 1275         return (error);
 1276 }
 1277 
 1278 static int
 1279 emulate_test(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1280     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
 1281 {
 1282         int error, size;
 1283         uint64_t op1, rflags, rflags2;
 1284 
 1285         size = vie->opsize;
 1286         error = EINVAL;
 1287 
 1288         switch (vie->op.op_byte) {
 1289         case 0xF7:
 1290                 /*
 1291                  * F7 /0                test r/m16, imm16
 1292                  * F7 /0                test r/m32, imm32
 1293                  * REX.W + F7 /0        test r/m64, imm32 sign-extended to 64
 1294                  *
 1295                  * Test mem (ModRM:r/m) with immediate and set status
 1296                  * flags according to the results.  The comparison is
  1297                  * performed by ANDing the immediate with the first
  1298                  * operand and then setting the status flags.
 1299                  */
 1300                 if ((vie->reg & 7) != 0)
 1301                         return (EINVAL);
 1302 
 1303                 error = memread(VCPU_ARGS, gpa, &op1, size, arg);
 1304                 if (error)
 1305                         return (error);
 1306 
 1307                 rflags2 = getandflags(size, op1, vie->immediate);
 1308                 break;
 1309         default:
 1310                 return (EINVAL);
 1311         }
 1312         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
 1313         if (error)
 1314                 return (error);
 1315 
 1316         /*
 1317          * OF and CF are cleared; the SF, ZF and PF flags are set according
 1318          * to the result; AF is undefined.
 1319          */
 1320         rflags &= ~RFLAGS_STATUS_BITS;
 1321         rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
 1322 
 1323         error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8);
 1324         return (error);
 1325 }
 1326 
 1327 static int
 1328 emulate_bextr(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1329     struct vm_guest_paging *paging, mem_region_read_t memread,
 1330     mem_region_write_t memwrite __unused, void *arg)
 1331 {
 1332         uint64_t src1, src2, dst, rflags;
 1333         unsigned start, len, size;
 1334         int error;
 1335 
 1336         size = vie->opsize;
 1337         error = EINVAL;
 1338 
 1339         /*
 1340          * VEX.LZ.0F38.W0 F7 /r         BEXTR r32a, r/m32, r32b
 1341          * VEX.LZ.0F38.W1 F7 /r         BEXTR r64a, r/m64, r64b
 1342          *
 1343          * Destination operand is ModRM:reg.  Source operands are ModRM:r/m and
 1344          * Vex.vvvv.
 1345          *
 1346          * Operand size is always 32-bit if not in 64-bit mode (W1 is ignored).
 1347          */
 1348         if (size != 4 && paging->cpu_mode != CPU_MODE_64BIT)
 1349                 size = 4;
 1350 
 1351         /*
 1352          * Extracts contiguous bits from the first /source/ operand (second
 1353          * operand) using an index and length specified in the second /source/
 1354          * operand (third operand).
 1355          */
 1356         error = memread(VCPU_ARGS, gpa, &src1, size, arg);
 1357         if (error)
 1358                 return (error);
 1359         error = vie_read_register(VCPU_ARGS, gpr_map[vie->vex_reg], &src2);
 1360         if (error)
 1361                 return (error);
 1362         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
 1363         if (error)
 1364                 return (error);
 1365 
 1366         start = (src2 & 0xff);
 1367         len = (src2 & 0xff00) >> 8;
 1368 
 1369         /* If no bits are extracted, the destination register is cleared. */
 1370         dst = 0;
 1371 
 1372         /* If START exceeds the operand size, no bits are extracted. */
 1373         if (start > size * 8)
 1374                 goto done;
 1375         /* Length is bounded by both the destination size and start offset. */
 1376         if (start + len > size * 8)
 1377                 len = (size * 8) - start;
 1378         if (len == 0)
 1379                 goto done;
 1380 
 1381         if (start > 0)
 1382                 src1 = (src1 >> start);
 1383         if (len < 64)
 1384                 src1 = src1 & ((1ull << len) - 1);
 1385         dst = src1;
 1386 
 1387 done:
 1388         error = vie_update_register(VCPU_ARGS, gpr_map[vie->reg], dst, size);
 1389         if (error)
 1390                 return (error);
 1391 
 1392         /*
 1393          * AMD: OF, CF cleared; SF/AF/PF undefined; ZF set by result.
 1394          * Intel: ZF is set by result; AF/SF/PF undefined; all others cleared.
 1395          */
 1396         rflags &= ~RFLAGS_STATUS_BITS;
 1397         if (dst == 0)
 1398                 rflags |= PSL_Z;
 1399         error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags,
 1400             8);
 1401         return (error);
 1402 }
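      /*
       * Worked example (illustrative values only): for a guest BEXTR with
       * src1 = 0x12345678 and a control word src2 = 0x0408 (start = 8,
       * len = 4), the code above shifts src1 right by 8 and masks it with
       * (1 << 4) - 1, leaving dst = 0x6 with ZF clear.
       */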
 1403 
 1404 static int
 1405 emulate_add(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1406     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
 1407 {
 1408         int error, size;
 1409         uint64_t nval, rflags, rflags2, val1, val2;
 1410         enum vm_reg_name reg;
 1411 
 1412         size = vie->opsize;
 1413         error = EINVAL;
 1414 
 1415         switch (vie->op.op_byte) {
 1416         case 0x03:
 1417                 /*
 1418                  * ADD r/m to r and store the result in r
 1419                  *
 1420                  * 03/r            ADD r16, r/m16
 1421                  * 03/r            ADD r32, r/m32
 1422                  * REX.W + 03/r    ADD r64, r/m64
 1423                  */
 1424 
 1425                 /* get the first operand */
 1426                 reg = gpr_map[vie->reg];
 1427                 error = vie_read_register(VCPU_ARGS, reg, &val1);
 1428                 if (error)
 1429                         break;
 1430 
 1431                 /* get the second operand */
 1432                 error = memread(VCPU_ARGS, gpa, &val2, size, arg);
 1433                 if (error)
 1434                         break;
 1435 
 1436                 /* perform the operation and write the result */
 1437                 nval = val1 + val2;
 1438                 error = vie_update_register(VCPU_ARGS, reg, nval, size);
 1439                 break;
 1440         default:
 1441                 break;
 1442         }
 1443 
 1444         if (!error) {
 1445                 rflags2 = getaddflags(size, val1, val2);
 1446                 error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS,
 1447                     &rflags);
 1448                 if (error)
 1449                         return (error);
 1450 
 1451                 rflags &= ~RFLAGS_STATUS_BITS;
 1452                 rflags |= rflags2 & RFLAGS_STATUS_BITS;
 1453                 error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS,
 1454                     rflags, 8);
 1455         }
 1456 
 1457         return (error);
 1458 }
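      /*
       * Illustrative flow (hypothetical guest instruction): an
       * "add (%rbx),%eax" that faults on an MMIO read arrives here with
       * op_byte 0x03; val1 is the current %eax, val2 the 4-byte MMIO
       * value, and both %eax and the RFLAGS status bits are refreshed
       * from the getaddflags() result.
       */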
 1459 
 1460 static int
 1461 emulate_sub(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1462     mem_region_read_t memread, mem_region_write_t memwrite __unused, void *arg)
 1463 {
 1464         int error, size;
 1465         uint64_t nval, rflags, rflags2, val1, val2;
 1466         enum vm_reg_name reg;
 1467 
 1468         size = vie->opsize;
 1469         error = EINVAL;
 1470 
 1471         switch (vie->op.op_byte) {
 1472         case 0x2B:
 1473                 /*
 1474                  * SUB r/m from r and store the result in r
 1475                  * 
 1476                  * 2B/r            SUB r16, r/m16
 1477                  * 2B/r            SUB r32, r/m32
 1478                  * REX.W + 2B/r    SUB r64, r/m64
 1479                  */
 1480 
 1481                 /* get the first operand */
 1482                 reg = gpr_map[vie->reg];
 1483                 error = vie_read_register(VCPU_ARGS, reg, &val1);
 1484                 if (error)
 1485                         break;
 1486 
 1487                 /* get the second operand */
 1488                 error = memread(VCPU_ARGS, gpa, &val2, size, arg);
 1489                 if (error)
 1490                         break;
 1491 
 1492                 /* perform the operation and write the result */
 1493                 nval = val1 - val2;
 1494                 error = vie_update_register(VCPU_ARGS, reg, nval, size);
 1495                 break;
 1496         default:
 1497                 break;
 1498         }
 1499 
 1500         if (!error) {
 1501                 rflags2 = getcc(size, val1, val2);
 1502                 error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS,
 1503                     &rflags);
 1504                 if (error)
 1505                         return (error);
 1506 
 1507                 rflags &= ~RFLAGS_STATUS_BITS;
 1508                 rflags |= rflags2 & RFLAGS_STATUS_BITS;
 1509                 error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS,
 1510                     rflags, 8);
 1511         }
 1512 
 1513         return (error);
 1514 }
 1515 
 1516 static int
 1517 emulate_stack_op(VCPU_DECL, uint64_t mmio_gpa, struct vie *vie,
 1518     struct vm_guest_paging *paging, mem_region_read_t memread,
 1519     mem_region_write_t memwrite, void *arg)
 1520 {
 1521 #ifdef _KERNEL
 1522         struct vm_copyinfo copyinfo[2];
 1523 #else
 1524         struct iovec copyinfo[2];
 1525 #endif
 1526         struct seg_desc ss_desc;
 1527         uint64_t cr0, rflags, rsp, stack_gla, val;
 1528         int error, fault, size, stackaddrsize, pushop;
 1529 
 1530         val = 0;
 1531         size = vie->opsize;
 1532         pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
 1533 
 1534         /*
 1535          * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
 1536          */
 1537         if (paging->cpu_mode == CPU_MODE_REAL) {
 1538                 stackaddrsize = 2;
 1539         } else if (paging->cpu_mode == CPU_MODE_64BIT) {
 1540                 /*
 1541                  * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
 1542                  * - Stack pointer size is always 64-bits.
 1543                  * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
 1544                  * - 16-bit PUSH/POP is supported by using the operand size
 1545                  *   override prefix (66H).
 1546                  */
 1547                 stackaddrsize = 8;
 1548                 size = vie->opsize_override ? 2 : 8;
 1549         } else {
 1550                 /*
 1551                  * In protected or compatibility mode the 'B' flag in the
 1552                  * stack-segment descriptor determines the size of the
 1553                  * stack pointer.
 1554                  */
 1555                 error = vm_get_seg_desc(VCPU_ARGS, VM_REG_GUEST_SS, &ss_desc);
 1556                 KASSERT(error == 0, ("%s: error %d getting SS descriptor",
 1557                     __func__, error));
 1558                 if (SEG_DESC_DEF32(ss_desc.access))
 1559                         stackaddrsize = 4;
 1560                 else
 1561                         stackaddrsize = 2;
 1562         }
 1563 
 1564         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_CR0, &cr0);
 1565         KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
 1566 
 1567         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
 1568         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 1569 
 1570         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RSP, &rsp);
 1571         KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
 1572         if (pushop) {
 1573                 rsp -= size;
 1574         }
 1575 
 1576         if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
 1577             rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
 1578             &stack_gla)) {
 1579                 vm_inject_ss(VCPU_ARGS, 0);
 1580                 return (0);
 1581         }
 1582 
 1583         if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
 1584                 vm_inject_ss(VCPU_ARGS, 0);
 1585                 return (0);
 1586         }
 1587 
 1588         if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
 1589                 vm_inject_ac(VCPU_ARGS, 0);
 1590                 return (0);
 1591         }
 1592 
 1593         error = vm_copy_setup(VCPU_ARGS, paging, stack_gla, size,
 1594             pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
 1595             &fault);
 1596         if (error || fault)
 1597                 return (error);
 1598 
 1599         if (pushop) {
 1600                 error = memread(VCPU_ARGS, mmio_gpa, &val, size, arg);
 1601                 if (error == 0)
 1602                         vm_copyout(&val, copyinfo, size);
 1603         } else {
 1604                 vm_copyin(copyinfo, &val, size);
 1605                 error = memwrite(VCPU_ARGS, mmio_gpa, val, size, arg);
 1606                 rsp += size;
 1607         }
 1608         vm_copy_teardown(copyinfo, nitems(copyinfo));
 1609 
 1610         if (error == 0) {
 1611                 error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RSP, rsp,
 1612                     stackaddrsize);
 1613                 KASSERT(error == 0, ("error %d updating rsp", error));
 1614         }
 1615         return (error);
 1616 }
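      /*
       * Descriptive note: a PUSH reads the value from MMIO and copies it
       * out to the guest stack at the decremented RSP, while a POP copies
       * the value in from the stack and writes it to MMIO; in either case
       * RSP is committed back to the guest only after the data transfer
       * succeeded.
       */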
 1617 
 1618 static int
 1619 emulate_push(VCPU_DECL, uint64_t mmio_gpa, struct vie *vie,
 1620     struct vm_guest_paging *paging, mem_region_read_t memread,
 1621     mem_region_write_t memwrite, void *arg)
 1622 {
 1623         int error;
 1624 
 1625         /*
 1626          * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
 1627          *
 1628          * PUSH is part of the group 5 extended opcodes and is identified
 1629          * by ModRM:reg = b110.
 1630          */
 1631         if ((vie->reg & 7) != 6)
 1632                 return (EINVAL);
 1633 
 1634         error = emulate_stack_op(VCPU_ARGS, mmio_gpa, vie, paging, memread,
 1635             memwrite, arg);
 1636         return (error);
 1637 }
 1638 
 1639 static int
 1640 emulate_pop(VCPU_DECL, uint64_t mmio_gpa, struct vie *vie,
 1641     struct vm_guest_paging *paging, mem_region_read_t memread,
 1642     mem_region_write_t memwrite, void *arg)
 1643 {
 1644         int error;
 1645 
 1646         /*
 1647          * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
 1648          *
 1649          * POP is part of the group 1A extended opcodes and is identified
 1650          * by ModRM:reg = b000.
 1651          */
 1652         if ((vie->reg & 7) != 0)
 1653                 return (EINVAL);
 1654 
 1655         error = emulate_stack_op(VCPU_ARGS, mmio_gpa, vie, paging, memread,
 1656             memwrite, arg);
 1657         return (error);
 1658 }
 1659 
 1660 static int
 1661 emulate_group1(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1662     struct vm_guest_paging *paging __unused, mem_region_read_t memread,
 1663     mem_region_write_t memwrite, void *memarg)
 1664 {
 1665         int error;
 1666 
 1667         switch (vie->reg & 7) {
 1668         case 0x1:       /* OR */
 1669                 error = emulate_or(VCPU_ARGS, gpa, vie,
 1670                     memread, memwrite, memarg);
 1671                 break;
 1672         case 0x4:       /* AND */
 1673                 error = emulate_and(VCPU_ARGS, gpa, vie,
 1674                     memread, memwrite, memarg);
 1675                 break;
 1676         case 0x7:       /* CMP */
 1677                 error = emulate_cmp(VCPU_ARGS, gpa, vie,
 1678                     memread, memwrite, memarg);
 1679                 break;
 1680         default:
 1681                 error = EINVAL;
 1682                 break;
 1683         }
 1684 
 1685         return (error);
 1686 }
 1687 
 1688 static int
 1689 emulate_bittest(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1690     mem_region_read_t memread, mem_region_write_t memwrite __unused,
 1691     void *memarg)
 1692 {
 1693         uint64_t val, rflags;
 1694         int error, bitmask, bitoff;
 1695 
 1696         /*
 1697          * 0F BA is a Group 8 extended opcode.
 1698          *
 1699          * Currently we only emulate the 'Bit Test' instruction, which is
 1700          * identified by a ModR/M:reg encoding of 100b.
 1701          */
 1702         if ((vie->reg & 7) != 4)
 1703                 return (EINVAL);
 1704 
 1705         error = vie_read_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, &rflags);
 1706         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 1707 
 1708         error = memread(VCPU_ARGS, gpa, &val, vie->opsize, memarg);
 1709         if (error)
 1710                 return (error);
 1711 
 1712         /*
 1713          * Intel SDM, Vol 2, Table 3-2:
 1714          * "Range of Bit Positions Specified by Bit Offset Operands"
 1715          */
 1716         bitmask = vie->opsize * 8 - 1;
 1717         bitoff = vie->immediate & bitmask;
 1718 
 1719         /* Copy the bit into the Carry flag in %rflags */
 1720         if (val & (1UL << bitoff))
 1721                 rflags |= PSL_C;
 1722         else
 1723                 rflags &= ~PSL_C;
 1724 
 1725         error = vie_update_register(VCPU_ARGS, VM_REG_GUEST_RFLAGS, rflags, 8);
 1726         KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
 1727 
 1728         return (0);
 1729 }
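      /*
       * Example (illustrative): "bt $9, (%rdi)" with a 4-byte operand
       * masks the immediate with 31, so bitoff = 9 and CF mirrors bit 9
       * of the value read from the guest physical address.
       */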
 1730 
 1731 static int
 1732 emulate_twob_group15(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1733     mem_region_read_t memread, mem_region_write_t memwrite __unused,
 1734     void *memarg)
 1735 {
 1736         int error;
 1737         uint64_t buf;
 1738 
 1739         switch (vie->reg & 7) {
 1740         case 0x7:       /* CLFLUSH, CLFLUSHOPT, and SFENCE */
 1741                 if (vie->mod == 0x3) {
 1742                         /*
 1743                          * SFENCE.  Ignore it, VM exit provides enough
 1744                          * barriers on its own.
 1745                          */
 1746                         error = 0;
 1747                 } else {
 1748                         /*
 1749                          * CLFLUSH, CLFLUSHOPT.  Only check for access
 1750                          * rights.
 1751                          */
 1752                         error = memread(VCPU_ARGS, gpa, &buf, 1, memarg);
 1753                 }
 1754                 break;
 1755         default:
 1756                 error = EINVAL;
 1757                 break;
 1758         }
 1759 
 1760         return (error);
 1761 }
 1762 
 1763 int
 1764 vmm_emulate_instruction(VCPU_DECL, uint64_t gpa, struct vie *vie,
 1765     struct vm_guest_paging *paging, mem_region_read_t memread,
 1766     mem_region_write_t memwrite, void *memarg)
 1767 {
 1768         int error;
 1769 
 1770         if (!vie->decoded)
 1771                 return (EINVAL);
 1772 
 1773         switch (vie->op.op_type) {
 1774         case VIE_OP_TYPE_GROUP1:
 1775                 error = emulate_group1(VCPU_ARGS, gpa, vie, paging, memread,
 1776                     memwrite, memarg);
 1777                 break;
 1778         case VIE_OP_TYPE_POP:
 1779                 error = emulate_pop(VCPU_ARGS, gpa, vie, paging, memread,
 1780                     memwrite, memarg);
 1781                 break;
 1782         case VIE_OP_TYPE_PUSH:
 1783                 error = emulate_push(VCPU_ARGS, gpa, vie, paging, memread,
 1784                     memwrite, memarg);
 1785                 break;
 1786         case VIE_OP_TYPE_CMP:
 1787                 error = emulate_cmp(VCPU_ARGS, gpa, vie,
 1788                                     memread, memwrite, memarg);
 1789                 break;
 1790         case VIE_OP_TYPE_MOV:
 1791                 error = emulate_mov(VCPU_ARGS, gpa, vie,
 1792                                     memread, memwrite, memarg);
 1793                 break;
 1794         case VIE_OP_TYPE_MOVSX:
 1795         case VIE_OP_TYPE_MOVZX:
 1796                 error = emulate_movx(VCPU_ARGS, gpa, vie,
 1797                                      memread, memwrite, memarg);
 1798                 break;
 1799         case VIE_OP_TYPE_MOVS:
 1800                 error = emulate_movs(VCPU_ARGS, gpa, vie, paging, memread,
 1801                     memwrite, memarg);
 1802                 break;
 1803         case VIE_OP_TYPE_STOS:
 1804                 error = emulate_stos(VCPU_ARGS, gpa, vie, paging, memread,
 1805                     memwrite, memarg);
 1806                 break;
 1807         case VIE_OP_TYPE_AND:
 1808                 error = emulate_and(VCPU_ARGS, gpa, vie,
 1809                                     memread, memwrite, memarg);
 1810                 break;
 1811         case VIE_OP_TYPE_OR:
 1812                 error = emulate_or(VCPU_ARGS, gpa, vie,
 1813                                     memread, memwrite, memarg);
 1814                 break;
 1815         case VIE_OP_TYPE_SUB:
 1816                 error = emulate_sub(VCPU_ARGS, gpa, vie,
 1817                                     memread, memwrite, memarg);
 1818                 break;
 1819         case VIE_OP_TYPE_BITTEST:
 1820                 error = emulate_bittest(VCPU_ARGS, gpa, vie,
 1821                     memread, memwrite, memarg);
 1822                 break;
 1823         case VIE_OP_TYPE_TWOB_GRP15:
 1824                 error = emulate_twob_group15(VCPU_ARGS, gpa, vie,
 1825                     memread, memwrite, memarg);
 1826                 break;
 1827         case VIE_OP_TYPE_ADD:
 1828                 error = emulate_add(VCPU_ARGS, gpa, vie, memread,
 1829                     memwrite, memarg);
 1830                 break;
 1831         case VIE_OP_TYPE_TEST:
 1832                 error = emulate_test(VCPU_ARGS, gpa, vie,
 1833                     memread, memwrite, memarg);
 1834                 break;
 1835         case VIE_OP_TYPE_BEXTR:
 1836                 error = emulate_bextr(VCPU_ARGS, gpa, vie, paging,
 1837                     memread, memwrite, memarg);
 1838                 break;
 1839         default:
 1840                 error = EINVAL;
 1841                 break;
 1842         }
 1843 
 1844         return (error);
 1845 }
 1846 
 1847 int
 1848 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
 1849 {
 1850         KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
 1851             ("%s: invalid size %d", __func__, size));
 1852         KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
 1853 
 1854         if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
 1855                 return (0);
 1856 
 1857         return ((gla & (size - 1)) ? 1 : 0);
 1858 }
 1859 
 1860 int
 1861 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
 1862 {
 1863         uint64_t mask;
 1864 
 1865         if (cpu_mode != CPU_MODE_64BIT)
 1866                 return (0);
 1867 
 1868         /*
 1869          * The value of bit 47 in the 'gla' should be replicated in the
 1870          * most significant 16 bits.
 1871          */
 1872         mask = ~((1UL << 48) - 1);
 1873         if (gla & (1UL << 47))
 1874                 return ((gla & mask) != mask);
 1875         else
 1876                 return ((gla & mask) != 0);
 1877 }
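      /*
       * For example, 0x00007fffffffffff and 0xffff800000000000 are
       * canonical (bits 63:48 replicate bit 47), whereas
       * 0x0000800000000000 is not and makes this function return 1.
       */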
 1878 
 1879 uint64_t
 1880 vie_size2mask(int size)
 1881 {
 1882         KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
 1883             ("vie_size2mask: invalid size %d", size));
 1884         return (size2mask[size]);
 1885 }
 1886 
 1887 int
 1888 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
 1889     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
 1890     int prot, uint64_t *gla)
 1891 {
 1892         uint64_t firstoff, low_limit, high_limit, segbase;
 1893         int glasize, type;
 1894 
 1895         KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
 1896             ("%s: invalid segment %d", __func__, seg));
 1897         KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
 1898             ("%s: invalid operand size %d", __func__, length));
 1899         KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
 1900             ("%s: invalid prot %#x", __func__, prot));
 1901 
 1902         firstoff = offset;
 1903         if (cpu_mode == CPU_MODE_64BIT) {
 1904                 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
 1905                     "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
 1906                 glasize = 8;
 1907         } else {
 1908                 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
 1909                     "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
 1910                 glasize = 4;
 1911                 /*
 1912                  * If the segment selector is loaded with a NULL selector
 1913                  * then the descriptor is unusable and attempting to use
 1914                  * it results in a #GP(0).
 1915                  */
 1916                 if (SEG_DESC_UNUSABLE(desc->access))
 1917                         return (-1);
 1918 
 1919                 /* 
 1920                  * The processor generates a #NP exception when a segment
 1921                  * register is loaded with a selector that points to a
 1922                  * descriptor that is not present. If this was the case then
 1923                  * it would have been checked before the VM-exit.
 1924                  */
 1925                 KASSERT(SEG_DESC_PRESENT(desc->access),
 1926                     ("segment %d not present: %#x", seg, desc->access));
 1927 
 1928                 /*
 1929                  * The descriptor type must indicate a code/data segment.
 1930                  */
 1931                 type = SEG_DESC_TYPE(desc->access);
 1932                 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
 1933                     "descriptor type %#x", seg, type));
 1934 
 1935                 if (prot & PROT_READ) {
 1936                         /* #GP on a read access to an exec-only code segment */
 1937                         if ((type & 0xA) == 0x8)
 1938                                 return (-1);
 1939                 }
 1940 
 1941                 if (prot & PROT_WRITE) {
 1942                         /*
 1943                          * #GP on a write access to a code segment or a
 1944                          * read-only data segment.
 1945                          */
 1946                         if (type & 0x8)                 /* code segment */
 1947                                 return (-1);
 1948 
 1949                         if ((type & 0xA) == 0)          /* read-only data seg */
 1950                                 return (-1);
 1951                 }
 1952 
 1953                 /*
 1954                  * 'desc->limit' is fully expanded taking granularity into
 1955                  * account.
 1956                  */
 1957                 if ((type & 0xC) == 0x4) {
 1958                         /* expand-down data segment */
 1959                         low_limit = desc->limit + 1;
 1960                         high_limit = SEG_DESC_DEF32(desc->access) ?
 1961                             0xffffffff : 0xffff;
 1962                 } else {
 1963                         /* code segment or expand-up data segment */
 1964                         low_limit = 0;
 1965                         high_limit = desc->limit;
 1966                 }
 1967 
 1968                 while (length > 0) {
 1969                         offset &= vie_size2mask(addrsize);
 1970                         if (offset < low_limit || offset > high_limit)
 1971                                 return (-1);
 1972                         offset++;
 1973                         length--;
 1974                 }
 1975         }
 1976 
 1977         /*
 1978          * In 64-bit mode all segments except %fs and %gs have a segment
 1979          * base address of 0.
 1980          */
 1981         if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
 1982             seg != VM_REG_GUEST_GS) {
 1983                 segbase = 0;
 1984         } else {
 1985                 segbase = desc->base;
 1986         }
 1987 
 1988         /*
 1989          * Truncate 'firstoff' to the effective address size before adding
 1990          * it to the segment base.
 1991          */
 1992         firstoff &= vie_size2mask(addrsize);
 1993         *gla = (segbase + firstoff) & vie_size2mask(glasize);
 1994         return (0);
 1995 }
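      /*
       * Example (illustrative): an expand-down data segment with
       * limit 0x0fff and the B flag set permits offsets 0x1000 through
       * 0xffffffff, which is exactly the [low_limit, high_limit] range
       * computed above.
       */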
 1996 
 1997 /*
 1998  * Prepare a partially decoded vie for a 2nd attempt.
 1999  */
 2000 void
 2001 vie_restart(struct vie *vie)
 2002 {
 2003         _Static_assert(
 2004             offsetof(struct vie, inst) < offsetof(struct vie, vie_startzero) &&
 2005             offsetof(struct vie, num_valid) < offsetof(struct vie, vie_startzero),
 2006             "restart should not erase instruction length or contents");
 2007 
 2008         memset((char *)vie + offsetof(struct vie, vie_startzero), 0,
 2009             sizeof(*vie) - offsetof(struct vie, vie_startzero));
 2010 
 2011         vie->base_register = VM_REG_LAST;
 2012         vie->index_register = VM_REG_LAST;
 2013         vie->segment_register = VM_REG_LAST;
 2014 }
 2015 
 2016 void
 2017 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
 2018 {
 2019         KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
 2020             ("%s: invalid instruction length (%d)", __func__, inst_length));
 2021 
 2022         vie_restart(vie);
 2023         memset(vie->inst, 0, sizeof(vie->inst));
 2024         if (inst_length != 0)
 2025                 memcpy(vie->inst, inst_bytes, inst_length);
 2026         vie->num_valid = inst_length;
 2027 }
 2028 
 2029 #ifdef _KERNEL
 2030 static int
 2031 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
 2032 {
 2033         int error_code = 0;
 2034 
 2035         if (pte & PG_V)
 2036                 error_code |= PGEX_P;
 2037         if (prot & VM_PROT_WRITE)
 2038                 error_code |= PGEX_W;
 2039         if (usermode)
 2040                 error_code |= PGEX_U;
 2041         if (rsvd)
 2042                 error_code |= PGEX_RSV;
 2043         if (prot & VM_PROT_EXECUTE)
 2044                 error_code |= PGEX_I;
 2045 
 2046         return (error_code);
 2047 }
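      /*
       * Example: a user-mode write to a present page that lacks PG_RW is
       * reported as PGEX_P | PGEX_W | PGEX_U (error code 0x7), the same
       * value hardware would push for that fault.
       */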
 2048 
 2049 static void
 2050 ptp_release(void **cookie)
 2051 {
 2052         if (*cookie != NULL) {
 2053                 vm_gpa_release(*cookie);
 2054                 *cookie = NULL;
 2055         }
 2056 }
 2057 
 2058 static void *
 2059 ptp_hold(struct vcpu *vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
 2060 {
 2061         void *ptr;
 2062 
 2063         ptp_release(cookie);
 2064         ptr = vm_gpa_hold(vcpu, ptpphys, len, VM_PROT_RW, cookie);
 2065         return (ptr);
 2066 }
 2067 
 2068 static int
 2069 _vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
 2070     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only)
 2071 {
 2072         int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
 2073         u_int retries;
 2074         uint64_t *ptpbase, ptpphys, pte, pgsize;
 2075         uint32_t *ptpbase32, pte32;
 2076         void *cookie;
 2077 
 2078         *guest_fault = 0;
 2079 
 2080         usermode = (paging->cpl == 3 ? 1 : 0);
 2081         writable = prot & VM_PROT_WRITE;
 2082         cookie = NULL;
 2083         retval = 0;
 2084         retries = 0;
 2085 restart:
 2086         ptpphys = paging->cr3;          /* root of the page tables */
 2087         ptp_release(&cookie);
 2088         if (retries++ > 0)
 2089                 maybe_yield();
 2090 
 2091         if (vie_canonical_check(paging->cpu_mode, gla)) {
 2092                 /*
 2093                  * XXX assuming a non-stack reference; otherwise a stack fault
 2094                  * should be generated.
 2095                  */
 2096                 if (!check_only)
 2097                         vm_inject_gp(vcpu);
 2098                 goto fault;
 2099         }
 2100 
 2101         if (paging->paging_mode == PAGING_MODE_FLAT) {
 2102                 *gpa = gla;
 2103                 goto done;
 2104         }
 2105 
 2106         if (paging->paging_mode == PAGING_MODE_32) {
 2107                 nlevels = 2;
 2108                 while (--nlevels >= 0) {
 2109                         /* Zero out the lower 12 bits. */
 2110                         ptpphys &= ~0xfff;
 2111 
 2112                         ptpbase32 = ptp_hold(vcpu, ptpphys, PAGE_SIZE,
 2113                             &cookie);
 2114 
 2115                         if (ptpbase32 == NULL)
 2116                                 goto error;
 2117 
 2118                         ptpshift = PAGE_SHIFT + nlevels * 10;
 2119                         ptpindex = (gla >> ptpshift) & 0x3FF;
 2120                         pgsize = 1UL << ptpshift;
 2121 
 2122                         pte32 = ptpbase32[ptpindex];
 2123 
 2124                         if ((pte32 & PG_V) == 0 ||
 2125                             (usermode && (pte32 & PG_U) == 0) ||
 2126                             (writable && (pte32 & PG_RW) == 0)) {
 2127                                 if (!check_only) {
 2128                                         pfcode = pf_error_code(usermode, prot, 0,
 2129                                             pte32);
 2130                                         vm_inject_pf(vcpu, pfcode, gla);
 2131                                 }
 2132                                 goto fault;
 2133                         }
 2134 
 2135                         /*
 2136                          * Emulate the x86 MMU's management of the accessed
 2137                          * and dirty flags. While the accessed flag is set
 2138                          * at every level of the page table, the dirty flag
 2139                          * is only set at the last level providing the guest
 2140                          * physical address.
 2141                          */
 2142                         if (!check_only && (pte32 & PG_A) == 0) {
 2143                                 if (atomic_cmpset_32(&ptpbase32[ptpindex],
 2144                                     pte32, pte32 | PG_A) == 0) {
 2145                                         goto restart;
 2146                                 }
 2147                         }
 2148 
 2149                         /* XXX must be ignored if CR4.PSE=0 */
 2150                         if (nlevels > 0 && (pte32 & PG_PS) != 0)
 2151                                 break;
 2152 
 2153                         ptpphys = pte32;
 2154                 }
 2155 
 2156                 /* Set the dirty bit in the page table entry if necessary */
 2157                 if (!check_only && writable && (pte32 & PG_M) == 0) {
 2158                         if (atomic_cmpset_32(&ptpbase32[ptpindex],
 2159                             pte32, pte32 | PG_M) == 0) {
 2160                                 goto restart;
 2161                         }
 2162                 }
 2163 
 2164                 /* Zero out the lower 'ptpshift' bits */
 2165                 pte32 >>= ptpshift; pte32 <<= ptpshift;
 2166                 *gpa = pte32 | (gla & (pgsize - 1));
 2167                 goto done;
 2168         }
 2169 
 2170         if (paging->paging_mode == PAGING_MODE_PAE) {
 2171                 /* Zero out the lower 5 bits and the upper 32 bits */
 2172                 ptpphys &= 0xffffffe0UL;
 2173 
 2174                 ptpbase = ptp_hold(vcpu, ptpphys, sizeof(*ptpbase) * 4,
 2175                     &cookie);
 2176                 if (ptpbase == NULL)
 2177                         goto error;
 2178 
 2179                 ptpindex = (gla >> 30) & 0x3;
 2180 
 2181                 pte = ptpbase[ptpindex];
 2182 
 2183                 if ((pte & PG_V) == 0) {
 2184                         if (!check_only) {
 2185                                 pfcode = pf_error_code(usermode, prot, 0, pte);
 2186                                 vm_inject_pf(vcpu, pfcode, gla);
 2187                         }
 2188                         goto fault;
 2189                 }
 2190 
 2191                 ptpphys = pte;
 2192 
 2193                 nlevels = 2;
 2194         } else if (paging->paging_mode == PAGING_MODE_64_LA57) {
 2195                 nlevels = 5;
 2196         } else {
 2197                 nlevels = 4;
 2198         }
 2199 
 2200         while (--nlevels >= 0) {
 2201                 /* Zero out the lower 12 bits and the upper 12 bits */
 2202                 ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
 2203 
 2204                 ptpbase = ptp_hold(vcpu, ptpphys, PAGE_SIZE, &cookie);
 2205                 if (ptpbase == NULL)
 2206                         goto error;
 2207 
 2208                 ptpshift = PAGE_SHIFT + nlevels * 9;
 2209                 ptpindex = (gla >> ptpshift) & 0x1FF;
 2210                 pgsize = 1UL << ptpshift;
 2211 
 2212                 pte = ptpbase[ptpindex];
 2213 
 2214                 if ((pte & PG_V) == 0 ||
 2215                     (usermode && (pte & PG_U) == 0) ||
 2216                     (writable && (pte & PG_RW) == 0)) {
 2217                         if (!check_only) {
 2218                                 pfcode = pf_error_code(usermode, prot, 0, pte);
 2219                                 vm_inject_pf(vcpu, pfcode, gla);
 2220                         }
 2221                         goto fault;
 2222                 }
 2223 
 2224                 /* Set the accessed bit in the page table entry */
 2225                 if (!check_only && (pte & PG_A) == 0) {
 2226                         if (atomic_cmpset_64(&ptpbase[ptpindex],
 2227                             pte, pte | PG_A) == 0) {
 2228                                 goto restart;
 2229                         }
 2230                 }
 2231 
 2232                 if (nlevels > 0 && (pte & PG_PS) != 0) {
 2233                         if (pgsize > 1 * GB) {
 2234                                 if (!check_only) {
 2235                                         pfcode = pf_error_code(usermode, prot, 1,
 2236                                             pte);
 2237                                         vm_inject_pf(vcpu, pfcode, gla);
 2238                                 }
 2239                                 goto fault;
 2240                         }
 2241                         break;
 2242                 }
 2243 
 2244                 ptpphys = pte;
 2245         }
 2246 
 2247         /* Set the dirty bit in the page table entry if necessary */
 2248         if (!check_only && writable && (pte & PG_M) == 0) {
 2249                 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
 2250                         goto restart;
 2251         }
 2252 
 2253         /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
 2254         pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
 2255         *gpa = pte | (gla & (pgsize - 1));
 2256 done:
 2257         ptp_release(&cookie);
 2258         KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
 2259             __func__, retval));
 2260         return (retval);
 2261 error:
 2262         retval = EFAULT;
 2263         goto done;
 2264 fault:
 2265         *guest_fault = 1;
 2266         goto done;
 2267 }
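      /*
       * Example walk (illustrative): with 4-level paging and a 4KB
       * mapping, the loop above finishes with ptpshift = 12, so the
       * returned GPA is the leaf PTE's page frame combined with the low
       * 12 bits of 'gla'; a 2MB superpage stops one level earlier with
       * ptpshift = 21.
       */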
 2268 
 2269 int
 2270 vm_gla2gpa(struct vcpu *vcpu, struct vm_guest_paging *paging,
 2271     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
 2272 {
 2273 
 2274         return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
 2275             false));
 2276 }
 2277 
 2278 int
 2279 vm_gla2gpa_nofault(struct vcpu *vcpu, struct vm_guest_paging *paging,
 2280     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
 2281 {
 2282 
 2283         return (_vm_gla2gpa(vcpu, paging, gla, prot, gpa, guest_fault,
 2284             true));
 2285 }
 2286 
 2287 int
 2288 vmm_fetch_instruction(struct vcpu *vcpu, struct vm_guest_paging *paging,
 2289     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
 2290 {
 2291         struct vm_copyinfo copyinfo[2];
 2292         int error, prot;
 2293 
 2294         if (inst_length > VIE_INST_SIZE)
 2295                 panic("vmm_fetch_instruction: invalid length %d", inst_length);
 2296 
 2297         prot = PROT_READ | PROT_EXEC;
 2298         error = vm_copy_setup(vcpu, paging, rip, inst_length, prot,
 2299             copyinfo, nitems(copyinfo), faultptr);
 2300         if (error || *faultptr)
 2301                 return (error);
 2302 
 2303         vm_copyin(copyinfo, vie->inst, inst_length);
 2304         vm_copy_teardown(copyinfo, nitems(copyinfo));
 2305         vie->num_valid = inst_length;
 2306         return (0);
 2307 }
 2308 #endif  /* _KERNEL */
 2309 
 2310 static int
 2311 vie_peek(struct vie *vie, uint8_t *x)
 2312 {
 2313 
 2314         if (vie->num_processed < vie->num_valid) {
 2315                 *x = vie->inst[vie->num_processed];
 2316                 return (0);
 2317         } else
 2318                 return (-1);
 2319 }
 2320 
 2321 static void
 2322 vie_advance(struct vie *vie)
 2323 {
 2324 
 2325         vie->num_processed++;
 2326 }
 2327 
 2328 static bool
 2329 segment_override(uint8_t x, int *seg)
 2330 {
 2331 
 2332         switch (x) {
 2333         case 0x2E:
 2334                 *seg = VM_REG_GUEST_CS;
 2335                 break;
 2336         case 0x36:
 2337                 *seg = VM_REG_GUEST_SS;
 2338                 break;
 2339         case 0x3E:
 2340                 *seg = VM_REG_GUEST_DS;
 2341                 break;
 2342         case 0x26:
 2343                 *seg = VM_REG_GUEST_ES;
 2344                 break;
 2345         case 0x64:
 2346                 *seg = VM_REG_GUEST_FS;
 2347                 break;
 2348         case 0x65:
 2349                 *seg = VM_REG_GUEST_GS;
 2350                 break;
 2351         default:
 2352                 return (false);
 2353         }
 2354         return (true);
 2355 }
 2356 
 2357 static int
 2358 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
 2359 {
 2360         uint8_t x;
 2361 
 2362         while (1) {
 2363                 if (vie_peek(vie, &x))
 2364                         return (-1);
 2365 
 2366                 if (x == 0x66)
 2367                         vie->opsize_override = 1;
 2368                 else if (x == 0x67)
 2369                         vie->addrsize_override = 1;
 2370                 else if (x == 0xF3)
 2371                         vie->repz_present = 1;
 2372                 else if (x == 0xF2)
 2373                         vie->repnz_present = 1;
 2374                 else if (segment_override(x, &vie->segment_register))
 2375                         vie->segment_override = 1;
 2376                 else
 2377                         break;
 2378 
 2379                 vie_advance(vie);
 2380         }
 2381 
 2382         /*
 2383          * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
 2384          * - Only one REX prefix is allowed per instruction.
 2385          * - The REX prefix must immediately precede the opcode byte or the
 2386          *   escape opcode byte.
 2387          * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
 2388          *   the mandatory prefix must come before the REX prefix.
 2389          */
 2390         if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
 2391                 vie->rex_present = 1;
 2392                 vie->rex_w = x & 0x8 ? 1 : 0;
 2393                 vie->rex_r = x & 0x4 ? 1 : 0;
 2394                 vie->rex_x = x & 0x2 ? 1 : 0;
 2395                 vie->rex_b = x & 0x1 ? 1 : 0;
 2396                 vie_advance(vie);
 2397         }
 2398 
 2399         /*
 2400          * § 2.3.5, "The VEX Prefix", SDM Vol 2.
 2401          */
 2402         if ((cpu_mode == CPU_MODE_64BIT || cpu_mode == CPU_MODE_COMPATIBILITY)
 2403             && x == 0xC4) {
 2404                 const struct vie_op *optab;
 2405 
 2406                 /* 3-byte VEX prefix. */
 2407                 vie->vex_present = 1;
 2408 
 2409                 vie_advance(vie);
 2410                 if (vie_peek(vie, &x))
 2411                         return (-1);
 2412 
 2413                 /*
 2414                  * 2nd byte: [R', X', B', mmmmm[4:0]].  Bits are inverted
 2415                  * relative to REX encoding.
 2416                  */
 2417                 vie->rex_r = x & 0x80 ? 0 : 1;
 2418                 vie->rex_x = x & 0x40 ? 0 : 1;
 2419                 vie->rex_b = x & 0x20 ? 0 : 1;
 2420 
 2421                 switch (x & 0x1F) {
 2422                 case 0x2:
 2423                         /* 0F 38. */
 2424                         optab = three_byte_opcodes_0f38;
 2425                         break;
 2426                 case 0x1:
 2427                         /* 0F class - nothing handled here yet. */
 2428                         /* FALLTHROUGH */
 2429                 case 0x3:
 2430                         /* 0F 3A class - nothing handled here yet. */
 2431                         /* FALLTHROUGH */
 2432                 default:
 2433                         /* Reserved (#UD). */
 2434                         return (-1);
 2435                 }
 2436 
 2437                 vie_advance(vie);
 2438                 if (vie_peek(vie, &x))
 2439                         return (-1);
 2440 
 2441                 /* 3rd byte: [W, vvvv[6:3], L, pp[1:0]]. */
 2442                 vie->rex_w = x & 0x80 ? 1 : 0;
 2443 
 2444                 vie->vex_reg = ((~(unsigned)x & 0x78u) >> 3);
 2445                 vie->vex_l = !!(x & 0x4);
 2446                 vie->vex_pp = (x & 0x3);
 2447 
 2448                 /* PP: 1=66 2=F3 3=F2 prefixes. */
 2449                 switch (vie->vex_pp) {
 2450                 case 0x1:
 2451                         vie->opsize_override = 1;
 2452                         break;
 2453                 case 0x2:
 2454                         vie->repz_present = 1;
 2455                         break;
 2456                 case 0x3:
 2457                         vie->repnz_present = 1;
 2458                         break;
 2459                 }
 2460 
 2461                 vie_advance(vie);
 2462 
 2463                 /* Opcode, sans the literal escape-byte prefix. */
 2464                 if (vie_peek(vie, &x))
 2465                         return (-1);
 2466 
 2467                 vie->op = optab[x];
 2468                 if (vie->op.op_type == VIE_OP_TYPE_NONE)
 2469                         return (-1);
 2470 
 2471                 vie_advance(vie);
 2472         }
 2473 
 2474         /*
 2475          * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
 2476          */
 2477         if (cpu_mode == CPU_MODE_64BIT) {
 2478                 /*
 2479                  * Default address size is 64-bits and default operand size
 2480                  * is 32-bits.
 2481                  */
 2482                 vie->addrsize = vie->addrsize_override ? 4 : 8;
 2483                 if (vie->rex_w)
 2484                         vie->opsize = 8;
 2485                 else if (vie->opsize_override)
 2486                         vie->opsize = 2;
 2487                 else
 2488                         vie->opsize = 4;
 2489         } else if (cs_d) {
 2490                 /* Default address and operand sizes are 32-bits */
 2491                 vie->addrsize = vie->addrsize_override ? 2 : 4;
 2492                 vie->opsize = vie->opsize_override ? 2 : 4;
 2493         } else {
 2494                 /* Default address and operand sizes are 16-bits */
 2495                 vie->addrsize = vie->addrsize_override ? 4 : 2;
 2496                 vie->opsize = vie->opsize_override ? 4 : 2;
 2497         }
 2498         return (0);
 2499 }
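      /*
       * Example (hypothetical byte stream): in 64-bit mode the prefixes
       * "66 48" first set opsize_override and then record REX.W, after
       * which the logic above selects addrsize = 8 and opsize = 8
       * (REX.W takes precedence over the 66h override).
       */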
 2500 
 2501 static int
 2502 decode_two_byte_opcode(struct vie *vie)
 2503 {
 2504         uint8_t x;
 2505 
 2506         if (vie_peek(vie, &x))
 2507                 return (-1);
 2508 
 2509         vie->op = two_byte_opcodes[x];
 2510 
 2511         if (vie->op.op_type == VIE_OP_TYPE_NONE)
 2512                 return (-1);
 2513 
 2514         vie_advance(vie);
 2515         return (0);
 2516 }
 2517 
 2518 static int
 2519 decode_opcode(struct vie *vie)
 2520 {
 2521         uint8_t x;
 2522 
 2523         if (vie_peek(vie, &x))
 2524                 return (-1);
 2525 
 2526         /* Already did this via VEX prefix. */
 2527         if (vie->op.op_type != VIE_OP_TYPE_NONE)
 2528                 return (0);
 2529 
 2530         vie->op = one_byte_opcodes[x];
 2531 
 2532         if (vie->op.op_type == VIE_OP_TYPE_NONE)
 2533                 return (-1);
 2534 
 2535         vie_advance(vie);
 2536 
 2537         if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
 2538                 return (decode_two_byte_opcode(vie));
 2539 
 2540         return (0);
 2541 }
 2542 
 2543 static int
 2544 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
 2545 {
 2546         uint8_t x;
 2547 
 2548         if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
 2549                 return (0);
 2550 
 2551         if (cpu_mode == CPU_MODE_REAL)
 2552                 return (-1);
 2553 
 2554         if (vie_peek(vie, &x))
 2555                 return (-1);
 2556 
 2557         vie->mod = (x >> 6) & 0x3;
 2558         vie->rm =  (x >> 0) & 0x7;
 2559         vie->reg = (x >> 3) & 0x7;
 2560 
 2561         /*
 2562          * A direct addressing mode makes no sense in the context of an EPT
 2563          * fault. There has to be a memory access involved to cause the
 2564          * EPT fault.
 2565          */
 2566         if (vie->mod == VIE_MOD_DIRECT)
 2567                 return (-1);
 2568 
 2569         if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
 2570             (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
 2571                 /*
 2572                  * Table 2-5: Special Cases of REX Encodings
 2573                  *
 2574                  * mod=0, r/m=5 is used in the compatibility mode to
 2575                  * indicate a disp32 without a base register.
 2576                  *
 2577                  * mod!=3, r/m=4 is used in the compatibility mode to
 2578                  * indicate that the SIB byte is present.
 2579                  *
 2580                  * The 'b' bit in the REX prefix is don't care in
 2581                  * this case.
 2582                  */
 2583         } else {
 2584                 vie->rm |= (vie->rex_b << 3);
 2585         }
 2586 
 2587         vie->reg |= (vie->rex_r << 3);
 2588 
 2589         /* SIB */
 2590         if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
 2591                 goto done;
 2592 
 2593         vie->base_register = gpr_map[vie->rm];
 2594 
 2595         switch (vie->mod) {
 2596         case VIE_MOD_INDIRECT_DISP8:
 2597                 vie->disp_bytes = 1;
 2598                 break;
 2599         case VIE_MOD_INDIRECT_DISP32:
 2600                 vie->disp_bytes = 4;
 2601                 break;
 2602         case VIE_MOD_INDIRECT:
 2603                 if (vie->rm == VIE_RM_DISP32) {
 2604                         vie->disp_bytes = 4;
 2605                         /*
 2606                          * Table 2-7. RIP-Relative Addressing
 2607                          *
 2608                          * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
 2609                          * whereas in compatibility mode it just implies disp32.
 2610                          */
 2611 
 2612                         if (cpu_mode == CPU_MODE_64BIT)
 2613                                 vie->base_register = VM_REG_GUEST_RIP;
 2614                         else
 2615                                 vie->base_register = VM_REG_LAST;
 2616                 }
 2617                 break;
 2618         }
 2619 
 2620 done:
 2621         vie_advance(vie);
 2622 
 2623         return (0);
 2624 }
 2625 
 2626 static int
 2627 decode_sib(struct vie *vie)
 2628 {
 2629         uint8_t x;
 2630 
 2631         /* Proceed only if SIB byte is present */
 2632         if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
 2633                 return (0);
 2634 
 2635         if (vie_peek(vie, &x))
 2636                 return (-1);
 2637 
 2638         /* De-construct the SIB byte */
 2639         vie->ss = (x >> 6) & 0x3;
 2640         vie->index = (x >> 3) & 0x7;
 2641         vie->base = (x >> 0) & 0x7;
 2642 
 2643         /* Apply the REX prefix modifiers */
 2644         vie->index |= vie->rex_x << 3;
 2645         vie->base |= vie->rex_b << 3;
 2646 
 2647         switch (vie->mod) {
 2648         case VIE_MOD_INDIRECT_DISP8:
 2649                 vie->disp_bytes = 1;
 2650                 break;
 2651         case VIE_MOD_INDIRECT_DISP32:
 2652                 vie->disp_bytes = 4;
 2653                 break;
 2654         }
 2655 
 2656         if (vie->mod == VIE_MOD_INDIRECT &&
 2657             (vie->base == 5 || vie->base == 13)) {
 2658                 /*
 2659                  * Special case: the base register is unused when mod = 0
 2660                  * and base = %rbp or %r13.
 2661                  *
 2662                  * Documented in:
 2663                  * Table 2-3: 32-bit Addressing Forms with the SIB Byte
 2664                  * Table 2-5: Special Cases of REX Encodings
 2665                  */
 2666                 vie->disp_bytes = 4;
 2667         } else {
 2668                 vie->base_register = gpr_map[vie->base];
 2669         }
 2670 
 2671         /*
 2672          * All encodings of 'index' are valid except for %rsp (4).
 2673          *
 2674          * Documented in:
 2675          * Table 2-3: 32-bit Addressing Forms with the SIB Byte
 2676          * Table 2-5: Special Cases of REX Encodings
 2677          */
 2678         if (vie->index != 4)
 2679                 vie->index_register = gpr_map[vie->index];
 2680 
 2681         /* 'scale' makes sense only in the context of an index register */
 2682         if (vie->index_register < VM_REG_LAST)
 2683                 vie->scale = 1 << vie->ss;
 2684 
 2685         vie_advance(vie);
 2686 
 2687         return (0);
 2688 }
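      /*
       * Example (illustrative encoding): for "mov 0x10(%rax,%rcx,4),%eax"
       * (8B 44 88 10) the ModR/M byte selects rm = 4, so this function
       * decodes ss = 2 (scale 4), index = %rcx and base = %rax, with a
       * one-byte displacement still to follow.
       */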
 2689 
 2690 static int
 2691 decode_displacement(struct vie *vie)
 2692 {
 2693         int n, i;
 2694         uint8_t x;
 2695 
 2696         union {
 2697                 char    buf[4];
 2698                 int8_t  signed8;
 2699                 int32_t signed32;
 2700         } u;
 2701 
 2702         if ((n = vie->disp_bytes) == 0)
 2703                 return (0);
 2704 
 2705         if (n != 1 && n != 4)
 2706                 panic("decode_displacement: invalid disp_bytes %d", n);
 2707 
 2708         for (i = 0; i < n; i++) {
 2709                 if (vie_peek(vie, &x))
 2710                         return (-1);
 2711 
 2712                 u.buf[i] = x;
 2713                 vie_advance(vie);
 2714         }
 2715 
 2716         if (n == 1)
 2717                 vie->displacement = u.signed8;          /* sign-extended */
 2718         else
 2719                 vie->displacement = u.signed32;         /* sign-extended */
 2720 
 2721         return (0);
 2722 }
 2723 
 2724 static int
 2725 decode_immediate(struct vie *vie)
 2726 {
 2727         int i, n;
 2728         uint8_t x;
 2729         union {
 2730                 char    buf[4];
 2731                 int8_t  signed8;
 2732                 int16_t signed16;
 2733                 int32_t signed32;
 2734         } u;
 2735 
 2736         /* Figure out immediate operand size (if any) */
 2737         if (vie->op.op_flags & VIE_OP_F_IMM) {
 2738                 /*
 2739                  * Section 2.2.1.5 "Immediates", Intel SDM:
 2740                  * In 64-bit mode the typical size of immediate operands
 2741                  * remains 32-bits. When the operand size is 64-bits, the
 2742                  * processor sign-extends all immediates to 64-bits prior
 2743                  * to their use.
 2744                  */
 2745                 if (vie->opsize == 4 || vie->opsize == 8)
 2746                         vie->imm_bytes = 4;
 2747                 else
 2748                         vie->imm_bytes = 2;
 2749         } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
 2750                 vie->imm_bytes = 1;
 2751         }
 2752 
 2753         if ((n = vie->imm_bytes) == 0)
 2754                 return (0);
 2755 
 2756         KASSERT(n == 1 || n == 2 || n == 4,
 2757             ("%s: invalid number of immediate bytes: %d", __func__, n));
 2758 
 2759         for (i = 0; i < n; i++) {
 2760                 if (vie_peek(vie, &x))
 2761                         return (-1);
 2762 
 2763                 u.buf[i] = x;
 2764                 vie_advance(vie);
 2765         }
 2766 
 2767         /* sign-extend the immediate value before use */
 2768         if (n == 1)
 2769                 vie->immediate = u.signed8;
 2770         else if (n == 2)
 2771                 vie->immediate = u.signed16;
 2772         else
 2773                 vie->immediate = u.signed32;
 2774 
 2775         return (0);
 2776 }
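      /*
       * Example (illustrative): with a 64-bit operand size and immediate
       * bytes f0 ff ff ff, four bytes are fetched and sign-extended, so
       * vie->immediate holds -16 (0xfffffffffffffff0 when later used at
       * 64 bits).
       */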
 2777 
 2778 static int
 2779 decode_moffset(struct vie *vie)
 2780 {
 2781         int i, n;
 2782         uint8_t x;
 2783         union {
 2784                 char    buf[8];
 2785                 uint64_t u64;
 2786         } u;
 2787 
 2788         if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
 2789                 return (0);
 2790 
 2791         /*
 2792          * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
 2793          * The memory offset size follows the address-size of the instruction.
 2794          */
 2795         n = vie->addrsize;
 2796         KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
 2797 
 2798         u.u64 = 0;
 2799         for (i = 0; i < n; i++) {
 2800                 if (vie_peek(vie, &x))
 2801                         return (-1);
 2802 
 2803                 u.buf[i] = x;
 2804                 vie_advance(vie);
 2805         }
 2806         vie->displacement = u.u64;
 2807         return (0);
 2808 }
 2809 
 2810 #ifdef _KERNEL
 2811 /*
 2812  * Verify that the 'guest linear address' provided as collateral of the nested
 2813  * page table fault matches our instruction decoding.
 2814  */
 2815 static int
 2816 verify_gla(struct vcpu *vcpu, uint64_t gla, struct vie *vie,
 2817     enum vm_cpu_mode cpu_mode)
 2818 {
 2819         int error;
 2820         uint64_t base, segbase, idx, gla2;
 2821         enum vm_reg_name seg;
 2822         struct seg_desc desc;
 2823 
 2824         /* Skip 'gla' verification */
 2825         if (gla == VIE_INVALID_GLA)
 2826                 return (0);
 2827 
 2828         base = 0;
 2829         if (vie->base_register != VM_REG_LAST) {
 2830                 error = vm_get_register(vcpu, vie->base_register, &base);
 2831                 if (error) {
 2832                         printf("verify_gla: error %d getting base reg %d\n",
 2833                                 error, vie->base_register);
 2834                         return (-1);
 2835                 }
 2836 
 2837                 /*
 2838                  * RIP-relative addressing starts from the following
 2839                  * instruction
 2840                  */
 2841                 if (vie->base_register == VM_REG_GUEST_RIP)
 2842                         base += vie->num_processed;
 2843         }
 2844 
 2845         idx = 0;
 2846         if (vie->index_register != VM_REG_LAST) {
 2847                 error = vm_get_register(vcpu, vie->index_register, &idx);
 2848                 if (error) {
 2849                         printf("verify_gla: error %d getting index reg %d\n",
 2850                                 error, vie->index_register);
 2851                         return (-1);
 2852                 }
 2853         }
 2854 
 2855         /*
 2856          * From "Specifying a Segment Selector", Intel SDM, Vol 1
 2857          *
 2858          * In 64-bit mode, segmentation is generally (but not
 2859          * completely) disabled.  The exceptions are the FS and GS
 2860          * segments.
 2861          *
 2862          * In legacy IA-32 mode, when the ESP or EBP register is used
 2863          * as the base, the SS segment is the default segment.  For
 2864          * other data references, except those relative to the stack or a
 2865          * string destination, the DS segment is the default.  These
 2866          * can be overridden to allow other segments to be accessed.
 2867          */
 2868         if (vie->segment_override)
 2869                 seg = vie->segment_register;
 2870         else if (vie->base_register == VM_REG_GUEST_RSP ||
 2871             vie->base_register == VM_REG_GUEST_RBP)
 2872                 seg = VM_REG_GUEST_SS;
 2873         else
 2874                 seg = VM_REG_GUEST_DS;
 2875         if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
 2876             seg != VM_REG_GUEST_GS) {
 2877                 segbase = 0;
 2878         } else {
 2879                 error = vm_get_seg_desc(vcpu, seg, &desc);
 2880                 if (error) {
 2881                         printf("verify_gla: error %d getting segment"
 2882                                " descriptor %d", error,
 2883                                vie->segment_register);
 2884                         return (-1);
 2885                 }
 2886                 segbase = desc.base;
 2887         }
 2888 
 2889         gla2 = segbase + base + vie->scale * idx + vie->displacement;
 2890         gla2 &= size2mask[vie->addrsize];
 2891         if (gla != gla2) {
 2892                 printf("verify_gla mismatch: segbase(0x%0lx)"
 2893                        "base(0x%0lx), scale(%d), index(0x%0lx), "
 2894                        "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
 2895                        segbase, base, vie->scale, idx, vie->displacement,
 2896                        gla, gla2);
 2897                 return (-1);
 2898         }
 2899 
 2900         return (0);
 2901 }
 2902 #endif  /* _KERNEL */
 2903 
 2904 int
 2905 #ifdef _KERNEL
 2906 vmm_decode_instruction(struct vcpu *vcpu, uint64_t gla,
 2907                        enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
 2908 #else
 2909 vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
 2910 #endif
 2911 {
 2912 
 2913         if (decode_prefixes(vie, cpu_mode, cs_d))
 2914                 return (-1);
 2915 
 2916         if (decode_opcode(vie))
 2917                 return (-1);
 2918 
 2919         if (decode_modrm(vie, cpu_mode))
 2920                 return (-1);
 2921 
 2922         if (decode_sib(vie))
 2923                 return (-1);
 2924 
 2925         if (decode_displacement(vie))
 2926                 return (-1);
 2927 
 2928         if (decode_immediate(vie))
 2929                 return (-1);
 2930 
 2931         if (decode_moffset(vie))
 2932                 return (-1);
 2933 
 2934 #ifdef _KERNEL
 2935         if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
 2936                 if (verify_gla(vcpu, gla, vie, cpu_mode))
 2937                         return (-1);
 2938         }
 2939 #endif
 2940 
 2941         vie->decoded = 1;       /* success */
 2942 
 2943         return (0);
 2944 }
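      /*
       * Typical call sequence (descriptive sketch; the actual call sites
       * live in the VM-exit handling code, not in this file):
       *
       *      vie_init(&vie, NULL, 0);
       *      vmm_fetch_instruction(vcpu, &paging, rip, inst_length, &vie,
       *          &fault);
       *      vmm_decode_instruction(vcpu, gla, cpu_mode, cs_d, &vie);
       *      vmm_emulate_instruction(vcpu, gpa, &vie, &paging, memread,
       *          memwrite, memarg);
       */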
