FreeBSD/Linux Kernel Cross Reference
sys/amd64/vmm/vmm_instruction_emul.c

    1 /*-
    2  * Copyright (c) 2012 Sandvine, Inc.
    3  * Copyright (c) 2012 NetApp, Inc.
    4  * All rights reserved.
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  * 1. Redistributions of source code must retain the above copyright
   10  *    notice, this list of conditions and the following disclaimer.
   11  * 2. Redistributions in binary form must reproduce the above copyright
   12  *    notice, this list of conditions and the following disclaimer in the
   13  *    documentation and/or other materials provided with the distribution.
   14  *
   15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   25  * SUCH DAMAGE.
   26  *
   27  * $FreeBSD: releng/11.0/sys/amd64/vmm/vmm_instruction_emul.c 299009 2016-05-03 22:07:18Z pfg $
   28  */
   29 
   30 #include <sys/cdefs.h>
   31 __FBSDID("$FreeBSD: releng/11.0/sys/amd64/vmm/vmm_instruction_emul.c 299009 2016-05-03 22:07:18Z pfg $");
   32 
   33 #ifdef _KERNEL
   34 #include <sys/param.h>
   35 #include <sys/pcpu.h>
   36 #include <sys/systm.h>
   37 #include <sys/proc.h>
   38 
   39 #include <vm/vm.h>
   40 #include <vm/pmap.h>
   41 
   42 #include <machine/vmparam.h>
   43 #include <machine/vmm.h>
   44 #else   /* !_KERNEL */
   45 #include <sys/types.h>
   46 #include <sys/errno.h>
   47 #include <sys/_iovec.h>
   48 
   49 #include <machine/vmm.h>
   50 
   51 #include <assert.h>
   52 #include <vmmapi.h>
   53 #define KASSERT(exp,msg)        assert((exp))
   54 #endif  /* _KERNEL */
   55 
   56 #include <machine/vmm_instruction_emul.h>
   57 #include <x86/psl.h>
   58 #include <x86/specialreg.h>
   59 
   60 /* struct vie_op.op_type */
   61 enum {
   62         VIE_OP_TYPE_NONE = 0,
   63         VIE_OP_TYPE_MOV,
   64         VIE_OP_TYPE_MOVSX,
   65         VIE_OP_TYPE_MOVZX,
   66         VIE_OP_TYPE_AND,
   67         VIE_OP_TYPE_OR,
   68         VIE_OP_TYPE_SUB,
   69         VIE_OP_TYPE_TWO_BYTE,
   70         VIE_OP_TYPE_PUSH,
   71         VIE_OP_TYPE_CMP,
   72         VIE_OP_TYPE_POP,
   73         VIE_OP_TYPE_MOVS,
   74         VIE_OP_TYPE_GROUP1,
   75         VIE_OP_TYPE_STOS,
   76         VIE_OP_TYPE_BITTEST,
   77         VIE_OP_TYPE_LAST
   78 };
   79 
   80 /* struct vie_op.op_flags */
   81 #define VIE_OP_F_IMM            (1 << 0)  /* 16/32-bit immediate operand */
   82 #define VIE_OP_F_IMM8           (1 << 1)  /* 8-bit immediate operand */
   83 #define VIE_OP_F_MOFFSET        (1 << 2)  /* 16/32/64-bit immediate moffset */
   84 #define VIE_OP_F_NO_MODRM       (1 << 3)
   85 #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
   86 
   87 static const struct vie_op two_byte_opcodes[256] = {
   88         [0xB6] = {
   89                 .op_byte = 0xB6,
   90                 .op_type = VIE_OP_TYPE_MOVZX,
   91         },
   92         [0xB7] = {
   93                 .op_byte = 0xB7,
   94                 .op_type = VIE_OP_TYPE_MOVZX,
   95         },
   96         [0xBA] = {
   97                 .op_byte = 0xBA,
   98                 .op_type = VIE_OP_TYPE_BITTEST,
   99                 .op_flags = VIE_OP_F_IMM8,
  100         },
  101         [0xBE] = {
  102                 .op_byte = 0xBE,
  103                 .op_type = VIE_OP_TYPE_MOVSX,
  104         },
  105 };
  106 
  107 static const struct vie_op one_byte_opcodes[256] = {
  108         [0x0F] = {
  109                 .op_byte = 0x0F,
  110                 .op_type = VIE_OP_TYPE_TWO_BYTE
  111         },
  112         [0x2B] = {
  113                 .op_byte = 0x2B,
  114                 .op_type = VIE_OP_TYPE_SUB,
  115         },
  116         [0x39] = {
  117                 .op_byte = 0x39,
  118                 .op_type = VIE_OP_TYPE_CMP,
  119         },
  120         [0x3B] = {
  121                 .op_byte = 0x3B,
  122                 .op_type = VIE_OP_TYPE_CMP,
  123         },
  124         [0x88] = {
  125                 .op_byte = 0x88,
  126                 .op_type = VIE_OP_TYPE_MOV,
  127         },
  128         [0x89] = {
  129                 .op_byte = 0x89,
  130                 .op_type = VIE_OP_TYPE_MOV,
  131         },
  132         [0x8A] = {
  133                 .op_byte = 0x8A,
  134                 .op_type = VIE_OP_TYPE_MOV,
  135         },
  136         [0x8B] = {
  137                 .op_byte = 0x8B,
  138                 .op_type = VIE_OP_TYPE_MOV,
  139         },
  140         [0xA1] = {
  141                 .op_byte = 0xA1,
  142                 .op_type = VIE_OP_TYPE_MOV,
  143                 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
  144         },
  145         [0xA3] = {
  146                 .op_byte = 0xA3,
  147                 .op_type = VIE_OP_TYPE_MOV,
  148                 .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
  149         },
  150         [0xA4] = {
  151                 .op_byte = 0xA4,
  152                 .op_type = VIE_OP_TYPE_MOVS,
  153                 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
  154         },
  155         [0xA5] = {
  156                 .op_byte = 0xA5,
  157                 .op_type = VIE_OP_TYPE_MOVS,
  158                 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
  159         },
  160         [0xAA] = {
  161                 .op_byte = 0xAA,
  162                 .op_type = VIE_OP_TYPE_STOS,
  163                 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
  164         },
  165         [0xAB] = {
  166                 .op_byte = 0xAB,
  167                 .op_type = VIE_OP_TYPE_STOS,
  168                 .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
  169         },
  170         [0xC6] = {
  171                 /* XXX Group 11 extended opcode - not just MOV */
  172                 .op_byte = 0xC6,
  173                 .op_type = VIE_OP_TYPE_MOV,
  174                 .op_flags = VIE_OP_F_IMM8,
  175         },
  176         [0xC7] = {
  177                 .op_byte = 0xC7,
  178                 .op_type = VIE_OP_TYPE_MOV,
  179                 .op_flags = VIE_OP_F_IMM,
  180         },
  181         [0x23] = {
  182                 .op_byte = 0x23,
  183                 .op_type = VIE_OP_TYPE_AND,
  184         },
  185         [0x80] = {
  186                 /* Group 1 extended opcode */
  187                 .op_byte = 0x80,
  188                 .op_type = VIE_OP_TYPE_GROUP1,
  189                 .op_flags = VIE_OP_F_IMM8,
  190         },
  191         [0x81] = {
  192                 /* Group 1 extended opcode */
  193                 .op_byte = 0x81,
  194                 .op_type = VIE_OP_TYPE_GROUP1,
  195                 .op_flags = VIE_OP_F_IMM,
  196         },
  197         [0x83] = {
  198                 /* Group 1 extended opcode */
  199                 .op_byte = 0x83,
  200                 .op_type = VIE_OP_TYPE_GROUP1,
  201                 .op_flags = VIE_OP_F_IMM8,
  202         },
  203         [0x8F] = {
  204                 /* XXX Group 1A extended opcode - not just POP */
  205                 .op_byte = 0x8F,
  206                 .op_type = VIE_OP_TYPE_POP,
  207         },
  208         [0xFF] = {
  209                 /* XXX Group 5 extended opcode - not just PUSH */
  210                 .op_byte = 0xFF,
  211                 .op_type = VIE_OP_TYPE_PUSH,
  212         }
  213 };
  214 
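/*
 * A minimal sketch of how the two tables above are meant to be used by
 * the decode step (assumed to live elsewhere in this file): a single
 * opcode byte indexes one_byte_opcodes[] directly, and a 0x0F escape
 * byte selects two_byte_opcodes[] for the byte that follows.  The
 * zero-initialized slots have op_type == VIE_OP_TYPE_NONE and mark
 * opcodes this emulator does not handle.  The helper name below is
 * purely illustrative.
 */
static __inline const struct vie_op *
vie_opcode_lookup_sketch(uint8_t byte0, uint8_t byte1)
{
        const struct vie_op *op;

        if (byte0 == 0x0F)
                op = &two_byte_opcodes[byte1];
        else
                op = &one_byte_opcodes[byte0];

        return (op->op_type != VIE_OP_TYPE_NONE ? op : NULL);
}
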
  215 /* struct vie.mod */
  216 #define VIE_MOD_INDIRECT                0
  217 #define VIE_MOD_INDIRECT_DISP8          1
  218 #define VIE_MOD_INDIRECT_DISP32         2
  219 #define VIE_MOD_DIRECT                  3
  220 
  221 /* struct vie.rm */
  222 #define VIE_RM_SIB                      4
  223 #define VIE_RM_DISP32                   5
  224 
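/*
 * The mod and rm values compared against the VIE_MOD_* and VIE_RM_*
 * constants above come from the standard ModRM byte layout: mod in
 * bits 7:6, reg in bits 5:3 and rm in bits 2:0.  The helper below is a
 * purely illustrative sketch of that split; the real decoder stores
 * the fields in struct vie.
 */
static __inline void
vie_modrm_split_sketch(uint8_t modrm, int *mod, int *reg, int *rm)
{
        *mod = (modrm >> 6) & 0x3;      /* compared against VIE_MOD_* */
        *reg = (modrm >> 3) & 0x7;      /* ModRM:reg; REX.R can extend it */
        *rm = modrm & 0x7;              /* compared against VIE_RM_* */
}
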
  225 #define GB                              (1024 * 1024 * 1024)
  226 
  227 static enum vm_reg_name gpr_map[16] = {
  228         VM_REG_GUEST_RAX,
  229         VM_REG_GUEST_RCX,
  230         VM_REG_GUEST_RDX,
  231         VM_REG_GUEST_RBX,
  232         VM_REG_GUEST_RSP,
  233         VM_REG_GUEST_RBP,
  234         VM_REG_GUEST_RSI,
  235         VM_REG_GUEST_RDI,
  236         VM_REG_GUEST_R8,
  237         VM_REG_GUEST_R9,
  238         VM_REG_GUEST_R10,
  239         VM_REG_GUEST_R11,
  240         VM_REG_GUEST_R12,
  241         VM_REG_GUEST_R13,
  242         VM_REG_GUEST_R14,
  243         VM_REG_GUEST_R15
  244 };
  245 
  246 static uint64_t size2mask[] = {
  247         [1] = 0xff,
  248         [2] = 0xffff,
  249         [4] = 0xffffffff,
  250         [8] = 0xffffffffffffffff,
  251 };
  252 
  253 static int
  254 vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
  255 {
  256         int error;
  257 
  258         error = vm_get_register(vm, vcpuid, reg, rval);
  259 
  260         return (error);
  261 }
  262 
  263 static void
  264 vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
  265 {
  266         *lhbr = 0;
  267         *reg = gpr_map[vie->reg];
  268 
  269         /*
  270          * 64-bit mode imposes limitations on accessing legacy high byte
  271          * registers (lhbr).
  272          *
  273          * The legacy high-byte registers cannot be addressed if the REX
  274          * prefix is present. In this case the values 4, 5, 6 and 7 of the
  275          * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
  276          *
  277          * If the REX prefix is not present then the values 4, 5, 6 and 7
  278          * of the 'ModRM:reg' field address the legacy high-byte registers,
  279          * %ah, %ch, %dh and %bh respectively.
  280          */
  281         if (!vie->rex_present) {
  282                 if (vie->reg & 0x4) {
  283                         *lhbr = 1;
  284                         *reg = gpr_map[vie->reg & 0x3];
  285                 }
  286         }
  287 }
  288 
  289 static int
  290 vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
  291 {
  292         uint64_t val;
  293         int error, lhbr;
  294         enum vm_reg_name reg;
  295 
  296         vie_calc_bytereg(vie, &reg, &lhbr);
  297         error = vm_get_register(vm, vcpuid, reg, &val);
  298 
  299         /*
  300          * To obtain the value of a legacy high byte register shift the
  301          * base register right by 8 bits (%ah = %rax >> 8).
  302          */
  303         if (lhbr)
  304                 *rval = val >> 8;
  305         else
  306                 *rval = val;
  307         return (error);
  308 }
  309 
  310 static int
  311 vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
  312 {
  313         uint64_t origval, val, mask;
  314         int error, lhbr;
  315         enum vm_reg_name reg;
  316 
  317         vie_calc_bytereg(vie, &reg, &lhbr);
  318         error = vm_get_register(vm, vcpuid, reg, &origval);
  319         if (error == 0) {
  320                 val = byte;
  321                 mask = 0xff;
  322                 if (lhbr) {
  323                         /*
  324                          * Shift left by 8 to store 'byte' in a legacy high
  325                          * byte register.
  326                          */
  327                         val <<= 8;
  328                         mask <<= 8;
  329                 }
  330                 val |= origval & ~mask;
  331                 error = vm_set_register(vm, vcpuid, reg, val);
  332         }
  333         return (error);
  334 }
  335 
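/*
 * Write 'val' to 'reg' following the x86 partial-register update rules:
 * 1 and 2 byte writes preserve the untouched high bits of the
 * destination, a 4 byte write zero-extends into the full 64-bit
 * register, and an 8 byte write replaces the register outright.
 */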
  336 int
  337 vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
  338                     uint64_t val, int size)
  339 {
  340         int error;
  341         uint64_t origval;
  342 
  343         switch (size) {
  344         case 1:
  345         case 2:
  346                 error = vie_read_register(vm, vcpuid, reg, &origval);
  347                 if (error)
  348                         return (error);
  349                 val &= size2mask[size];
  350                 val |= origval & ~size2mask[size];
  351                 break;
  352         case 4:
  353                 val &= 0xffffffffUL;
  354                 break;
  355         case 8:
  356                 break;
  357         default:
  358                 return (EINVAL);
  359         }
  360 
  361         error = vm_set_register(vm, vcpuid, reg, val);
  362         return (error);
  363 }
  364 
  365 #define RFLAGS_STATUS_BITS    (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
  366 
  367 /*
  368  * Return the status flags that would result from doing (x - y).
  369  */
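/*
 * For example, getcc(4, 1, 1) returns %rflags with PSL_Z set and PSL_C
 * clear, while getcc(4, 1, 2) returns it with PSL_C set because the
 * 32-bit subtraction borrows.  Callers only consume the bits covered by
 * RFLAGS_STATUS_BITS.
 */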
  370 #define GETCC(sz)                                                       \
  371 static u_long                                                           \
  372 getcc##sz(uint##sz##_t x, uint##sz##_t y)                               \
  373 {                                                                       \
  374         u_long rflags;                                                  \
  375                                                                         \
  376         __asm __volatile("sub %2,%1; pushfq; popq %0" :                 \
  377             "=r" (rflags), "+r" (x) : "m" (y));                         \
  378         return (rflags);                                                \
  379 } struct __hack
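/*
 * The trailing 'struct __hack' above lets each GETCC() instantiation
 * below be written with a terminating semicolon: the semicolon completes
 * an (otherwise unused) incomplete struct declaration rather than
 * dangling after the function body.
 */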
  380 
  381 GETCC(8);
  382 GETCC(16);
  383 GETCC(32);
  384 GETCC(64);
  385 
  386 static u_long
  387 getcc(int opsize, uint64_t x, uint64_t y)
  388 {
  389         KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
  390             ("getcc: invalid operand size %d", opsize));
  391 
  392         if (opsize == 1)
  393                 return (getcc8(x, y));
  394         else if (opsize == 2)
  395                 return (getcc16(x, y));
  396         else if (opsize == 4)
  397                 return (getcc32(x, y));
  398         else
  399                 return (getcc64(x, y));
  400 }
  401 
  402 static int
  403 emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
  404             mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
  405 {
  406         int error, size;
  407         enum vm_reg_name reg;
  408         uint8_t byte;
  409         uint64_t val;
  410 
  411         size = vie->opsize;
  412         error = EINVAL;
  413 
  414         switch (vie->op.op_byte) {
  415         case 0x88:
  416                 /*
  417                  * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
  418                  * 88/r:        mov r/m8, r8
  419                  * REX + 88/r:  mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
  420                  */
  421                 size = 1;       /* override for byte operation */
  422                 error = vie_read_bytereg(vm, vcpuid, vie, &byte);
  423                 if (error == 0)
  424                         error = memwrite(vm, vcpuid, gpa, byte, size, arg);
  425                 break;
  426         case 0x89:
  427                 /*
  428                  * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
  429                  * 89/r:        mov r/m16, r16
  430                  * 89/r:        mov r/m32, r32
  431                  * REX.W + 89/r mov r/m64, r64
  432                  */
  433                 reg = gpr_map[vie->reg];
  434                 error = vie_read_register(vm, vcpuid, reg, &val);
  435                 if (error == 0) {
  436                         val &= size2mask[size];
  437                         error = memwrite(vm, vcpuid, gpa, val, size, arg);
  438                 }
  439                 break;
  440         case 0x8A:
  441                 /*
  442                  * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
  443                  * 8A/r:        mov r8, r/m8
  444                  * REX + 8A/r:  mov r8, r/m8
  445                  */
  446                 size = 1;       /* override for byte operation */
  447                 error = memread(vm, vcpuid, gpa, &val, size, arg);
  448                 if (error == 0)
  449                         error = vie_write_bytereg(vm, vcpuid, vie, val);
  450                 break;
  451         case 0x8B:
  452                 /*
  453                  * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
  454                  * 8B/r:        mov r16, r/m16
  455                  * 8B/r:        mov r32, r/m32
  456                  * REX.W 8B/r:  mov r64, r/m64
  457                  */
  458                 error = memread(vm, vcpuid, gpa, &val, size, arg);
  459                 if (error == 0) {
  460                         reg = gpr_map[vie->reg];
  461                         error = vie_update_register(vm, vcpuid, reg, val, size);
  462                 }
  463                 break;
  464         case 0xA1:
  465                 /*
  466                  * MOV from seg:moffset to AX/EAX/RAX
  467                  * A1:          mov AX, moffs16
  468                  * A1:          mov EAX, moffs32
  469                  * REX.W + A1:  mov RAX, moffs64
  470                  */
  471                 error = memread(vm, vcpuid, gpa, &val, size, arg);
  472                 if (error == 0) {
  473                         reg = VM_REG_GUEST_RAX;
  474                         error = vie_update_register(vm, vcpuid, reg, val, size);
  475                 }
  476                 break;
  477         case 0xA3:
  478                 /*
  479                  * MOV from AX/EAX/RAX to seg:moffset
  480                  * A3:          mov moffs16, AX
  481                  * A3:          mov moffs32, EAX 
  482                  * REX.W + A3:  mov moffs64, RAX
  483                  */
  484                 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
  485                 if (error == 0) {
  486                         val &= size2mask[size];
  487                         error = memwrite(vm, vcpuid, gpa, val, size, arg);
  488                 }
  489                 break;
  490         case 0xC6:
  491                 /*
  492                  * MOV from imm8 to mem (ModRM:r/m)
  493                  * C6/0         mov r/m8, imm8
  494                  * REX + C6/0   mov r/m8, imm8
  495                  */
  496                 size = 1;       /* override for byte operation */
  497                 error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
  498                 break;
  499         case 0xC7:
  500                 /*
  501                  * MOV from imm16/imm32 to mem (ModRM:r/m)
  502                  * C7/0         mov r/m16, imm16
  503                  * C7/0         mov r/m32, imm32
  504                  * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
  505                  */
  506                 val = vie->immediate & size2mask[size];
  507                 error = memwrite(vm, vcpuid, gpa, val, size, arg);
  508                 break;
  509         default:
  510                 break;
  511         }
  512 
  513         return (error);
  514 }
  515 
  516 static int
  517 emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
  518              mem_region_read_t memread, mem_region_write_t memwrite,
  519              void *arg)
  520 {
  521         int error, size;
  522         enum vm_reg_name reg;
  523         uint64_t val;
  524 
  525         size = vie->opsize;
  526         error = EINVAL;
  527 
  528         switch (vie->op.op_byte) {
  529         case 0xB6:
  530                 /*
  531                  * MOV and zero extend byte from mem (ModRM:r/m) to
  532                  * reg (ModRM:reg).
  533                  *
  534                  * 0F B6/r              movzx r16, r/m8
  535                  * 0F B6/r              movzx r32, r/m8
  536                  * REX.W + 0F B6/r      movzx r64, r/m8
  537                  */
  538 
  539                 /* get the first operand */
  540                 error = memread(vm, vcpuid, gpa, &val, 1, arg);
  541                 if (error)
  542                         break;
  543 
  544                 /* get the second operand */
  545                 reg = gpr_map[vie->reg];
  546 
  547                 /* zero-extend byte */
  548                 val = (uint8_t)val;
  549 
  550                 /* write the result */
  551                 error = vie_update_register(vm, vcpuid, reg, val, size);
  552                 break;
  553         case 0xB7:
  554                 /*
  555                  * MOV and zero extend word from mem (ModRM:r/m) to
  556                  * reg (ModRM:reg).
  557                  *
  558                  * 0F B7/r              movzx r32, r/m16
  559                  * REX.W + 0F B7/r      movzx r64, r/m16
  560                  */
  561                 error = memread(vm, vcpuid, gpa, &val, 2, arg);
  562                 if (error)
  563                         return (error);
  564 
  565                 reg = gpr_map[vie->reg];
  566 
  567                 /* zero-extend word */
  568                 val = (uint16_t)val;
  569 
  570                 error = vie_update_register(vm, vcpuid, reg, val, size);
  571                 break;
  572         case 0xBE:
  573                 /*
  574                  * MOV and sign extend byte from mem (ModRM:r/m) to
  575                  * reg (ModRM:reg).
  576                  *
  577                  * 0F BE/r              movsx r16, r/m8
  578                  * 0F BE/r              movsx r32, r/m8
  579                  * REX.W + 0F BE/r      movsx r64, r/m8
  580                  */
  581 
  582                 /* get the first operand */
  583                 error = memread(vm, vcpuid, gpa, &val, 1, arg);
  584                 if (error)
  585                         break;
  586 
  587                 /* get the second operand */
  588                 reg = gpr_map[vie->reg];
  589 
  590                 /* sign extend byte */
  591                 val = (int8_t)val;
  592 
  593                 /* write the result */
  594                 error = vie_update_register(vm, vcpuid, reg, val, size);
  595                 break;
  596         default:
  597                 break;
  598         }
  599         return (error);
  600 }
  601 
  602 /*
  603  * Helper function to calculate and validate a linear address.
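 * On success 0 is returned with '*fault' cleared.  If the access would
 * fault in the guest, the appropriate exception (#SS, #GP or #AC) is
 * injected, '*fault' is set and 0 is still returned so that the caller
 * can resume the guest.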
  604  */
  605 static int
  606 get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
  607     int opsize, int addrsize, int prot, enum vm_reg_name seg,
  608     enum vm_reg_name gpr, uint64_t *gla, int *fault)
  609 {
  610         struct seg_desc desc;
  611         uint64_t cr0, val, rflags;
  612         int error;
  613 
  614         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
  615         KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
  616 
  617         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
  618         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
  619 
  620         error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
  621         KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
  622             __func__, error, seg));
  623 
  624         error = vie_read_register(vm, vcpuid, gpr, &val);
  625         KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
  626             error, gpr));
  627 
  628         if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
  629             addrsize, prot, gla)) {
  630                 if (seg == VM_REG_GUEST_SS)
  631                         vm_inject_ss(vm, vcpuid, 0);
  632                 else
  633                         vm_inject_gp(vm, vcpuid);
  634                 goto guest_fault;
  635         }
  636 
  637         if (vie_canonical_check(paging->cpu_mode, *gla)) {
  638                 if (seg == VM_REG_GUEST_SS)
  639                         vm_inject_ss(vm, vcpuid, 0);
  640                 else
  641                         vm_inject_gp(vm, vcpuid);
  642                 goto guest_fault;
  643         }
  644 
  645         if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
  646                 vm_inject_ac(vm, vcpuid, 0);
  647                 goto guest_fault;
  648         }
  649 
  650         *fault = 0;
  651         return (0);
  652 
  653 guest_fault:
  654         *fault = 1;
  655         return (0);
  656 }
  657 
  658 static int
  659 emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
  660     struct vm_guest_paging *paging, mem_region_read_t memread,
  661     mem_region_write_t memwrite, void *arg)
  662 {
  663 #ifdef _KERNEL
  664         struct vm_copyinfo copyinfo[2];
  665 #else
  666         struct iovec copyinfo[2];
  667 #endif
  668         uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
  669         uint64_t rcx, rdi, rsi, rflags;
  670         int error, fault, opsize, seg, repeat;
  671 
  672         opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
  673         val = 0;
  674         error = 0;
  675 
  676         /*
  677          * XXX although the MOVS instruction is only supposed to be used with
  678          * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
  679          *
  680          * Empirically the "repnz" prefix has identical behavior to "rep"
  681          * and the zero flag does not make a difference.
  682          */
  683         repeat = vie->repz_present | vie->repnz_present;
  684 
  685         if (repeat) {
  686                 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
  687                 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
  688 
  689                 /*
  690                  * The count register is %rcx, %ecx or %cx depending on the
  691                  * address size of the instruction.
  692                  */
  693                 if ((rcx & vie_size2mask(vie->addrsize)) == 0) {
  694                         error = 0;
  695                         goto done;
  696                 }
  697         }
  698 
  699         /*
  700          *      Source          Destination     Comments
  701          *      --------------------------------------------
  702          * (1)  memory          memory          n/a
  703          * (2)  memory          mmio            emulated
  704          * (3)  mmio            memory          emulated
  705          * (4)  mmio            mmio            emulated
  706          *
  707          * At this point we don't have sufficient information to distinguish
  708          * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
  709          * out because it will succeed only when operating on regular memory.
  710          *
  711          * XXX the emulation doesn't properly handle the case where 'gpa'
  712          * is straddling the boundary between the normal memory and MMIO.
  713          */
  714 
  715         seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
  716         error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
  717             PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
  718         if (error || fault)
  719                 goto done;
  720 
  721         error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
  722             copyinfo, nitems(copyinfo), &fault);
  723         if (error == 0) {
  724                 if (fault)
  725                         goto done;      /* Resume guest to handle fault */
  726 
  727                 /*
  728                  * case (2): read from system memory and write to mmio.
  729                  */
  730                 vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
  731                 vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
  732                 error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
  733                 if (error)
  734                         goto done;
  735         } else {
  736                 /*
  737                  * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
  738                  * if 'srcaddr' is in the mmio space.
  739                  */
  740 
  741                 error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
  742                     PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr,
  743                     &fault);
  744                 if (error || fault)
  745                         goto done;
  746 
  747                 error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
  748                     PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
  749                 if (error == 0) {
  750                         if (fault)
  751                                 goto done;    /* Resume guest to handle fault */
  752 
  753                         /*
  754                          * case (3): read from MMIO and write to system memory.
  755                          *
  756                          * A MMIO read can have side-effects so we
  757                          * commit to it only after vm_copy_setup() is
  758                          * successful. If a page-fault needs to be
  759                          * injected into the guest then it will happen
  760                          * before the MMIO read is attempted.
  761                          */
  762                         error = memread(vm, vcpuid, gpa, &val, opsize, arg);
  763                         if (error)
  764                                 goto done;
  765 
  766                         vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
  767                         vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
  768                 } else {
  769                         /*
  770                          * Case (4): read from and write to mmio.
  771                          *
  772                          * Commit to the MMIO read/write (with potential
  773                          * side-effects) only after we are sure that the
  774                          * instruction is not going to be restarted due
  775                          * to address translation faults.
  776                          */
  777                         error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
  778                             PROT_READ, &srcgpa, &fault);
  779                         if (error || fault)
  780                                 goto done;
  781 
  782                         error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
  783                            PROT_WRITE, &dstgpa, &fault);
  784                         if (error || fault)
  785                                 goto done;
  786 
  787                         error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
  788                         if (error)
  789                                 goto done;
  790 
  791                         error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
  792                         if (error)
  793                                 goto done;
  794                 }
  795         }
  796 
  797         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
  798         KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
  799 
  800         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
  801         KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
  802 
  803         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
  804         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
  805 
  806         if (rflags & PSL_D) {
  807                 rsi -= opsize;
  808                 rdi -= opsize;
  809         } else {
  810                 rsi += opsize;
  811                 rdi += opsize;
  812         }
  813 
  814         error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
  815             vie->addrsize);
  816         KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
  817 
  818         error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
  819             vie->addrsize);
  820         KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
  821 
  822         if (repeat) {
  823                 rcx = rcx - 1;
  824                 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
  825                     rcx, vie->addrsize);
  826                 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
  827 
  828                 /*
  829                  * Repeat the instruction if the count register is not zero.
  830                  */
  831                 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
  832                         vm_restart_instruction(vm, vcpuid);
  833         }
  834 done:
  835         KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d",
  836             __func__, error));
  837         return (error);
  838 }
  839 
  840 static int
  841 emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
  842     struct vm_guest_paging *paging, mem_region_read_t memread,
  843     mem_region_write_t memwrite, void *arg)
  844 {
  845         int error, opsize, repeat;
  846         uint64_t val;
  847         uint64_t rcx, rdi, rflags;
  848 
  849         opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
  850         repeat = vie->repz_present | vie->repnz_present;
  851 
  852         if (repeat) {
  853                 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
  854                 KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
  855 
  856                 /*
  857                  * The count register is %rcx, %ecx or %cx depending on the
  858                  * address size of the instruction.
  859                  */
  860                 if ((rcx & vie_size2mask(vie->addrsize)) == 0)
  861                         return (0);
  862         }
  863 
  864         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
  865         KASSERT(!error, ("%s: error %d getting rax", __func__, error));
  866 
  867         error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
  868         if (error)
  869                 return (error);
  870 
  871         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
  872         KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
  873 
  874         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
  875         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
  876 
  877         if (rflags & PSL_D)
  878                 rdi -= opsize;
  879         else
  880                 rdi += opsize;
  881 
  882         error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
  883             vie->addrsize);
  884         KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
  885 
  886         if (repeat) {
  887                 rcx = rcx - 1;
  888                 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
  889                     rcx, vie->addrsize);
  890                 KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
  891 
  892                 /*
  893                  * Repeat the instruction if the count register is not zero.
  894                  */
  895                 if ((rcx & vie_size2mask(vie->addrsize)) != 0)
  896                         vm_restart_instruction(vm, vcpuid);
  897         }
  898 
  899         return (0);
  900 }
  901 
  902 static int
  903 emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
  904             mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
  905 {
  906         int error, size;
  907         enum vm_reg_name reg;
  908         uint64_t result, rflags, rflags2, val1, val2;
  909 
  910         size = vie->opsize;
  911         error = EINVAL;
  912 
  913         switch (vie->op.op_byte) {
  914         case 0x23:
  915                 /*
  916                  * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
  917                  * result in reg.
  918                  *
  919                  * 23/r         and r16, r/m16
  920                  * 23/r         and r32, r/m32
  921                  * REX.W + 23/r and r64, r/m64
  922                  */
  923 
  924                 /* get the first operand */
  925                 reg = gpr_map[vie->reg];
  926                 error = vie_read_register(vm, vcpuid, reg, &val1);
  927                 if (error)
  928                         break;
  929 
  930                 /* get the second operand */
  931                 error = memread(vm, vcpuid, gpa, &val2, size, arg);
  932                 if (error)
  933                         break;
  934 
  935                 /* perform the operation and write the result */
  936                 result = val1 & val2;
  937                 error = vie_update_register(vm, vcpuid, reg, result, size);
  938                 break;
  939         case 0x81:
  940         case 0x83:
  941                 /*
  942                  * AND mem (ModRM:r/m) with immediate and store the
  943                  * result in mem.
  944                  *
  945                  * 81 /4                and r/m16, imm16
  946                  * 81 /4                and r/m32, imm32
  947                  * REX.W + 81 /4        and r/m64, imm32 sign-extended to 64
  948                  *
  949                  * 83 /4                and r/m16, imm8 sign-extended to 16
  950                  * 83 /4                and r/m32, imm8 sign-extended to 32
  951                  * REX.W + 83/4         and r/m64, imm8 sign-extended to 64
  952                  */
  953 
  954                 /* get the first operand */
  955                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
  956                 if (error)
  957                         break;
  958 
  959                 /*
  960                  * perform the operation with the pre-fetched immediate
  961                  * operand and write the result
  962                  */
  963                 result = val1 & vie->immediate;
  964                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
  965                 break;
  966         default:
  967                 break;
  968         }
  969         if (error)
  970                 return (error);
  971 
  972         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
  973         if (error)
  974                 return (error);
  975 
  976         /*
  977          * OF and CF are cleared; the SF, ZF and PF flags are set according
  978          * to the result; AF is undefined.
  979          *
  980          * The updated status flags are obtained by subtracting 0 from 'result'.
  981          */
  982         rflags2 = getcc(size, result, 0);
  983         rflags &= ~RFLAGS_STATUS_BITS;
  984         rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
  985 
  986         error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
  987         return (error);
  988 }
  989 
  990 static int
  991 emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
  992             mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
  993 {
  994         int error, size;
  995         uint64_t val1, result, rflags, rflags2;
  996 
  997         size = vie->opsize;
  998         error = EINVAL;
  999 
 1000         switch (vie->op.op_byte) {
 1001         case 0x81:
 1002         case 0x83:
 1003                 /*
 1004                  * OR mem (ModRM:r/m) with immediate and store the
 1005                  * result in mem.
 1006                  *
 1007                  * 81 /1                or r/m16, imm16
 1008                  * 81 /1                or r/m32, imm32
 1009                  * REX.W + 81 /1        or r/m64, imm32 sign-extended to 64
 1010                  *
 1011                  * 83 /1                or r/m16, imm8 sign-extended to 16
 1012                  * 83 /1                or r/m32, imm8 sign-extended to 32
 1013                  * REX.W + 83/1         or r/m64, imm8 sign-extended to 64
 1014                  */
 1015 
 1016                 /* get the first operand */
 1017                 error = memread(vm, vcpuid, gpa, &val1, size, arg);
 1018                 if (error)
 1019                         break;
 1020 
 1021                 /*
 1022                  * perform the operation with the pre-fetched immediate
 1023                  * operand and write the result
 1024                  */
 1025                 result = val1 | vie->immediate;
 1026                 error = memwrite(vm, vcpuid, gpa, result, size, arg);
 1027                 break;
 1028         default:
 1029                 break;
 1030         }
 1031         if (error)
 1032                 return (error);
 1033 
 1034         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
 1035         if (error)
 1036                 return (error);
 1037 
 1038         /*
 1039          * OF and CF are cleared; the SF, ZF and PF flags are set according
 1040          * to the result; AF is undefined.
 1041          *
 1042          * The updated status flags are obtained by subtracting 0 from 'result'.
 1043          */
 1044         rflags2 = getcc(size, result, 0);
 1045         rflags &= ~RFLAGS_STATUS_BITS;
 1046         rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
 1047 
 1048         error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
 1049         return (error);
 1050 }
 1051 
 1052 static int
 1053 emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 1054             mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 1055 {
 1056         int error, size;
 1057         uint64_t regop, memop, op1, op2, rflags, rflags2;
 1058         enum vm_reg_name reg;
 1059 
 1060         size = vie->opsize;
 1061         switch (vie->op.op_byte) {
 1062         case 0x39:
 1063         case 0x3B:
 1064                 /*
 1065                  * 39/r         CMP r/m16, r16
 1066                  * 39/r         CMP r/m32, r32
 1067                  * REX.W 39/r   CMP r/m64, r64
 1068                  *
 1069                  * 3B/r         CMP r16, r/m16
 1070                  * 3B/r         CMP r32, r/m32
 1071                  * REX.W + 3B/r CMP r64, r/m64
 1072                  *
 1073                  * Compare the first operand with the second operand and
 1074                  * set status flags in EFLAGS register. The comparison is
 1075                  * performed by subtracting the second operand from the first
 1076                  * operand and then setting the status flags.
 1077                  */
 1078 
 1079                 /* Get the register operand */
 1080                 reg = gpr_map[vie->reg];
 1081                 error = vie_read_register(vm, vcpuid, reg, &regop);
 1082                 if (error)
 1083                         return (error);
 1084 
 1085                 /* Get the memory operand */
 1086                 error = memread(vm, vcpuid, gpa, &memop, size, arg);
 1087                 if (error)
 1088                         return (error);
 1089 
 1090                 if (vie->op.op_byte == 0x3B) {
 1091                         op1 = regop;
 1092                         op2 = memop;
 1093                 } else {
 1094                         op1 = memop;
 1095                         op2 = regop;
 1096                 }
 1097                 rflags2 = getcc(size, op1, op2);
 1098                 break;
 1099         case 0x80:
 1100         case 0x81:
 1101         case 0x83:
 1102                 /*
 1103                  * 80 /7                cmp r/m8, imm8
 1104                  * REX + 80 /7          cmp r/m8, imm8
 1105                  *
 1106                  * 81 /7                cmp r/m16, imm16
 1107                  * 81 /7                cmp r/m32, imm32
 1108                  * REX.W + 81 /7        cmp r/m64, imm32 sign-extended to 64
 1109                  *
 1110                  * 83 /7                cmp r/m16, imm8 sign-extended to 16
 1111                  * 83 /7                cmp r/m32, imm8 sign-extended to 32
 1112                  * REX.W + 83 /7        cmp r/m64, imm8 sign-extended to 64
 1113                  *
 1114                  * Compare mem (ModRM:r/m) with immediate and set
 1115                  * status flags according to the results.  The
 1116                  * comparison is performed by subtracting the
 1117                  * immediate from the first operand and then setting
 1118                  * the status flags.
 1119                  *
 1120                  */
 1121                 if (vie->op.op_byte == 0x80)
 1122                         size = 1;
 1123 
 1124                 /* get the first operand */
 1125                 error = memread(vm, vcpuid, gpa, &op1, size, arg);
 1126                 if (error)
 1127                         return (error);
 1128 
 1129                 rflags2 = getcc(size, op1, vie->immediate);
 1130                 break;
 1131         default:
 1132                 return (EINVAL);
 1133         }
 1134         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
 1135         if (error)
 1136                 return (error);
 1137         rflags &= ~RFLAGS_STATUS_BITS;
 1138         rflags |= rflags2 & RFLAGS_STATUS_BITS;
 1139 
 1140         error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
 1141         return (error);
 1142 }
 1143 
 1144 static int
 1145 emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 1146             mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
 1147 {
 1148         int error, size;
 1149         uint64_t nval, rflags, rflags2, val1, val2;
 1150         enum vm_reg_name reg;
 1151 
 1152         size = vie->opsize;
 1153         error = EINVAL;
 1154 
 1155         switch (vie->op.op_byte) {
 1156         case 0x2B:
 1157                 /*
 1158                  * SUB r/m from r and store the result in r
 1159                  * 
 1160                  * 2B/r            SUB r16, r/m16
 1161                  * 2B/r            SUB r32, r/m32
 1162                  * REX.W + 2B/r    SUB r64, r/m64
 1163                  */
 1164 
 1165                 /* get the first operand */
 1166                 reg = gpr_map[vie->reg];
 1167                 error = vie_read_register(vm, vcpuid, reg, &val1);
 1168                 if (error)
 1169                         break;
 1170 
 1171                 /* get the second operand */
 1172                 error = memread(vm, vcpuid, gpa, &val2, size, arg);
 1173                 if (error)
 1174                         break;
 1175 
 1176                 /* perform the operation and write the result */
 1177                 nval = val1 - val2;
 1178                 error = vie_update_register(vm, vcpuid, reg, nval, size);
 1179                 break;
 1180         default:
 1181                 break;
 1182         }
 1183 
 1184         if (!error) {
 1185                 rflags2 = getcc(size, val1, val2);
 1186                 error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
 1187                     &rflags);
 1188                 if (error)
 1189                         return (error);
 1190 
 1191                 rflags &= ~RFLAGS_STATUS_BITS;
 1192                 rflags |= rflags2 & RFLAGS_STATUS_BITS;
 1193                 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
 1194                     rflags, 8);
 1195         }
 1196 
 1197         return (error);
 1198 }
 1199 
 1200 static int
 1201 emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
 1202     struct vm_guest_paging *paging, mem_region_read_t memread,
 1203     mem_region_write_t memwrite, void *arg)
 1204 {
 1205 #ifdef _KERNEL
 1206         struct vm_copyinfo copyinfo[2];
 1207 #else
 1208         struct iovec copyinfo[2];
 1209 #endif
 1210         struct seg_desc ss_desc;
 1211         uint64_t cr0, rflags, rsp, stack_gla, val;
 1212         int error, fault, size, stackaddrsize, pushop;
 1213 
 1214         val = 0;
 1215         size = vie->opsize;
 1216         pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
 1217 
 1218         /*
  1219          * From "Address-Size Attributes for Stack Accesses", Intel SDM, Vol 1
 1220          */
 1221         if (paging->cpu_mode == CPU_MODE_REAL) {
 1222                 stackaddrsize = 2;
 1223         } else if (paging->cpu_mode == CPU_MODE_64BIT) {
 1224                 /*
 1225                  * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
 1226                  * - Stack pointer size is always 64-bits.
 1227                  * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
 1228                  * - 16-bit PUSH/POP is supported by using the operand size
 1229                  *   override prefix (66H).
 1230                  */
 1231                 stackaddrsize = 8;
 1232                 size = vie->opsize_override ? 2 : 8;
 1233         } else {
 1234                 /*
 1235                  * In protected or compatibility mode the 'B' flag in the
 1236                  * stack-segment descriptor determines the size of the
 1237                  * stack pointer.
 1238                  */
 1239                 error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
 1240                 KASSERT(error == 0, ("%s: error %d getting SS descriptor",
 1241                     __func__, error));
 1242                 if (SEG_DESC_DEF32(ss_desc.access))
 1243                         stackaddrsize = 4;
 1244                 else
 1245                         stackaddrsize = 2;
 1246         }
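        /*
         * For example, a guest in 64-bit mode pushing a 16-bit MMIO
         * operand (66 FF /6) ends up here with size = 2 from the
         * operand-size override while stackaddrsize remains 8, so %rsp
         * below is decremented by 2 but treated as a 64-bit pointer.
         */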
 1247 
 1248         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
 1249         KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
 1250 
 1251         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
 1252         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 1253 
 1254         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
 1255         KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
 1256         if (pushop) {
 1257                 rsp -= size;
 1258         }
 1259 
 1260         if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
 1261             rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
 1262             &stack_gla)) {
 1263                 vm_inject_ss(vm, vcpuid, 0);
 1264                 return (0);
 1265         }
 1266 
 1267         if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
 1268                 vm_inject_ss(vm, vcpuid, 0);
 1269                 return (0);
 1270         }
 1271 
 1272         if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
 1273                 vm_inject_ac(vm, vcpuid, 0);
 1274                 return (0);
 1275         }
 1276 
 1277         error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
 1278             pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo),
 1279             &fault);
 1280         if (error || fault)
 1281                 return (error);
 1282 
 1283         if (pushop) {
 1284                 error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
 1285                 if (error == 0)
 1286                         vm_copyout(vm, vcpuid, &val, copyinfo, size);
 1287         } else {
 1288                 vm_copyin(vm, vcpuid, copyinfo, &val, size);
 1289                 error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
 1290                 rsp += size;
 1291         }
 1292         vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 1293 
 1294         if (error == 0) {
 1295                 error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
 1296                     stackaddrsize);
 1297                 KASSERT(error == 0, ("error %d updating rsp", error));
 1298         }
 1299         return (error);
 1300 }
 1301 
 1302 static int
 1303 emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
 1304     struct vm_guest_paging *paging, mem_region_read_t memread,
 1305     mem_region_write_t memwrite, void *arg)
 1306 {
 1307         int error;
 1308 
 1309         /*
 1310          * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
 1311          *
 1312          * PUSH is part of the group 5 extended opcodes and is identified
 1313          * by ModRM:reg = b110.
 1314          */
 1315         if ((vie->reg & 7) != 6)
 1316                 return (EINVAL);
 1317 
 1318         error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
 1319             memwrite, arg);
 1320         return (error);
 1321 }
 1322 
 1323 static int
 1324 emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
 1325     struct vm_guest_paging *paging, mem_region_read_t memread,
 1326     mem_region_write_t memwrite, void *arg)
 1327 {
 1328         int error;
 1329 
 1330         /*
 1331          * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
 1332          *
 1333          * POP is part of the group 1A extended opcodes and is identified
 1334          * by ModRM:reg = b000.
 1335          */
 1336         if ((vie->reg & 7) != 0)
 1337                 return (EINVAL);
 1338 
 1339         error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
 1340             memwrite, arg);
 1341         return (error);
 1342 }
 1343 
 1344 static int
 1345 emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 1346     struct vm_guest_paging *paging, mem_region_read_t memread,
 1347     mem_region_write_t memwrite, void *memarg)
 1348 {
 1349         int error;
 1350 
 1351         switch (vie->reg & 7) {
 1352         case 0x1:       /* OR */
 1353                 error = emulate_or(vm, vcpuid, gpa, vie,
 1354                     memread, memwrite, memarg);
 1355                 break;
 1356         case 0x4:       /* AND */
 1357                 error = emulate_and(vm, vcpuid, gpa, vie,
 1358                     memread, memwrite, memarg);
 1359                 break;
 1360         case 0x7:       /* CMP */
 1361                 error = emulate_cmp(vm, vcpuid, gpa, vie,
 1362                     memread, memwrite, memarg);
 1363                 break;
 1364         default:
 1365                 error = EINVAL;
 1366                 break;
 1367         }
 1368 
 1369         return (error);
 1370 }
 1371 
 1372 static int
 1373 emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 1374     mem_region_read_t memread, mem_region_write_t memwrite, void *memarg)
 1375 {
 1376         uint64_t val, rflags;
 1377         int error, bitmask, bitoff;
 1378 
 1379         /*
 1380          * 0F BA is a Group 8 extended opcode.
 1381          *
 1382          * Currently we only emulate the 'Bit Test' instruction which is
 1383          * identified by a ModR/M:reg encoding of 100b.
 1384          */
 1385         if ((vie->reg & 7) != 4)
 1386                 return (EINVAL);
 1387 
 1388         error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
 1389         KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
 1390 
 1391         error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg);
 1392         if (error)
 1393                 return (error);
 1394 
 1395         /*
 1396          * Intel SDM, Vol 2, Table 3-2:
 1397          * "Range of Bit Positions Specified by Bit Offset Operands"
 1398          */
 1399         bitmask = vie->opsize * 8 - 1;
 1400         bitoff = vie->immediate & bitmask;
 1401 
 1402         /* Copy the bit into the Carry flag in %rflags */
 1403         if (val & (1UL << bitoff))
 1404                 rflags |= PSL_C;
 1405         else
 1406                 rflags &= ~PSL_C;
 1407 
 1408         error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
 1409         KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error));
 1410 
 1411         return (0);
 1412 }
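
/*
 * Editorial note -- illustrative sketch, not part of the original source:
 * for BT with an immediate operand the bit offset is taken modulo the
 * operand size in bits, which is what the 'bitmask'/'bitoff' computation
 * in emulate_bittest() implements.  The values below are hypothetical.
 */
#if 0   /* example only; not compiled with this file */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t val = 0x20;            /* memory operand read via memread() */
        int opsize = 4;                 /* 32-bit operand size */
        int64_t immediate = 37;         /* BT imm8 operand */
        int bitmask = opsize * 8 - 1;   /* 31 */
        int bitoff = immediate & bitmask;       /* 37 & 31 == 5 */
        int cf = (val >> bitoff) & 1;   /* bit 5 of 0x20 is set */

        printf("bitoff=%d CF=%d\n", bitoff, cf);
        return (0);
}
#endif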
 1413 
 1414 int
 1415 vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
 1416     struct vm_guest_paging *paging, mem_region_read_t memread,
 1417     mem_region_write_t memwrite, void *memarg)
 1418 {
 1419         int error;
 1420 
 1421         if (!vie->decoded)
 1422                 return (EINVAL);
 1423 
 1424         switch (vie->op.op_type) {
 1425         case VIE_OP_TYPE_GROUP1:
 1426                 error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
 1427                     memwrite, memarg);
 1428                 break;
 1429         case VIE_OP_TYPE_POP:
 1430                 error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
 1431                     memwrite, memarg);
 1432                 break;
 1433         case VIE_OP_TYPE_PUSH:
 1434                 error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
 1435                     memwrite, memarg);
 1436                 break;
 1437         case VIE_OP_TYPE_CMP:
 1438                 error = emulate_cmp(vm, vcpuid, gpa, vie,
 1439                                     memread, memwrite, memarg);
 1440                 break;
 1441         case VIE_OP_TYPE_MOV:
 1442                 error = emulate_mov(vm, vcpuid, gpa, vie,
 1443                                     memread, memwrite, memarg);
 1444                 break;
 1445         case VIE_OP_TYPE_MOVSX:
 1446         case VIE_OP_TYPE_MOVZX:
 1447                 error = emulate_movx(vm, vcpuid, gpa, vie,
 1448                                      memread, memwrite, memarg);
 1449                 break;
 1450         case VIE_OP_TYPE_MOVS:
 1451                 error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
 1452                     memwrite, memarg);
 1453                 break;
 1454         case VIE_OP_TYPE_STOS:
 1455                 error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
 1456                     memwrite, memarg);
 1457                 break;
 1458         case VIE_OP_TYPE_AND:
 1459                 error = emulate_and(vm, vcpuid, gpa, vie,
 1460                                     memread, memwrite, memarg);
 1461                 break;
 1462         case VIE_OP_TYPE_OR:
 1463                 error = emulate_or(vm, vcpuid, gpa, vie,
 1464                                     memread, memwrite, memarg);
 1465                 break;
 1466         case VIE_OP_TYPE_SUB:
 1467                 error = emulate_sub(vm, vcpuid, gpa, vie,
 1468                                     memread, memwrite, memarg);
 1469                 break;
 1470         case VIE_OP_TYPE_BITTEST:
 1471                 error = emulate_bittest(vm, vcpuid, gpa, vie,
 1472                     memread, memwrite, memarg);
 1473                 break;
 1474         default:
 1475                 error = EINVAL;
 1476                 break;
 1477         }
 1478 
 1479         return (error);
 1480 }
 1481 
 1482 int
 1483 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
 1484 {
 1485         KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
 1486             ("%s: invalid size %d", __func__, size));
 1487         KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
 1488 
 1489         if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
 1490                 return (0);
 1491 
 1492         return ((gla & (size - 1)) ? 1 : 0);
 1493 }
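
/*
 * Editorial note -- illustrative sketch, not part of the original source:
 * an alignment-check (#AC) fault is only possible for CPL 3 accesses with
 * both CR0.AM and RFLAGS.AC set and a linear address that is not naturally
 * aligned for the access size, which is the predicate vie_alignment_check()
 * evaluates.  The helper and sample values below are hypothetical.
 */
#if 0   /* example only; not compiled with this file */
#include <stdio.h>
#include <stdint.h>

static int
alignment_fault(int cpl, int size, int cr0_am, int rflags_ac, uint64_t gla)
{
        if (cpl != 3 || !cr0_am || !rflags_ac)
                return (0);
        return ((gla & (size - 1)) != 0);
}

int
main(void)
{
        /* A 4-byte access at 0x1003 is misaligned: faults when all gates are open. */
        printf("%d\n", alignment_fault(3, 4, 1, 1, 0x1003));    /* 1 */
        printf("%d\n", alignment_fault(0, 4, 1, 1, 0x1003));    /* 0: CPL 0 */
        return (0);
}
#endif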
 1494 
 1495 int
 1496 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
 1497 {
 1498         uint64_t mask;
 1499 
 1500         if (cpu_mode != CPU_MODE_64BIT)
 1501                 return (0);
 1502 
 1503         /*
 1504          * The value of bit 47 in the 'gla' should be replicated in the
 1505          * most significant 16 bits.
 1506          */
 1507         mask = ~((1UL << 48) - 1);
 1508         if (gla & (1UL << 47))
 1509                 return ((gla & mask) != mask);
 1510         else
 1511                 return ((gla & mask) != 0);
 1512 }
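
/*
 * Editorial note -- illustrative sketch, not part of the original source:
 * the mask-based test in vie_canonical_check() is equivalent to checking
 * that sign-extending bit 47 reproduces the original address.  The helper
 * below shows that alternative formulation; it assumes 48-bit linear
 * addresses and an arithmetic right shift of signed values.
 */
#if 0   /* example only; not compiled with this file */
#include <stdio.h>
#include <stdint.h>

static int
is_canonical48(uint64_t gla)
{
        /* Shift bit 47 up to bit 63, sign-extend back down, and compare. */
        return ((uint64_t)((int64_t)(gla << 16) >> 16) == gla);
}

int
main(void)
{
        printf("%d\n", is_canonical48(0x00007fffffffffffUL));   /* 1 */
        printf("%d\n", is_canonical48(0xffff800000000000UL));   /* 1 */
        printf("%d\n", is_canonical48(0x0000800000000000UL));   /* 0 */
        return (0);
}
#endif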
 1513 
 1514 uint64_t
 1515 vie_size2mask(int size)
 1516 {
 1517         KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
 1518             ("vie_size2mask: invalid size %d", size));
 1519         return (size2mask[size]);
 1520 }
 1521 
 1522 int
 1523 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
 1524     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
 1525     int prot, uint64_t *gla)
 1526 {
 1527         uint64_t firstoff, low_limit, high_limit, segbase;
 1528         int glasize, type;
 1529 
 1530         KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
 1531             ("%s: invalid segment %d", __func__, seg));
 1532         KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
 1533             ("%s: invalid operand size %d", __func__, length));
 1534         KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
 1535             ("%s: invalid prot %#x", __func__, prot));
 1536 
 1537         firstoff = offset;
 1538         if (cpu_mode == CPU_MODE_64BIT) {
 1539                 KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
 1540                     "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
 1541                 glasize = 8;
 1542         } else {
 1543                 KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
 1544                     "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
 1545                 glasize = 4;
 1546                 /*
 1547                  * If the segment register is loaded with a NULL selector
 1548                  * then the descriptor is unusable and attempting to use
 1549                  * it results in a #GP(0).
 1550                  */
 1551                 if (SEG_DESC_UNUSABLE(desc->access))
 1552                         return (-1);
 1553 
 1554                 /* 
 1555                  * The processor generates a #NP exception when a segment
 1556                  * register is loaded with a selector that points to a
 1557                  * descriptor that is not present. If that were the case, it
 1558                  * would have been detected before the VM-exit.
 1559                  */
 1560                 KASSERT(SEG_DESC_PRESENT(desc->access),
 1561                     ("segment %d not present: %#x", seg, desc->access));
 1562 
 1563                 /*
 1564                  * The descriptor type must indicate a code/data segment.
 1565                  */
 1566                 type = SEG_DESC_TYPE(desc->access);
 1567                 KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
 1568                     "descriptor type %#x", seg, type));
 1569 
 1570                 if (prot & PROT_READ) {
 1571                         /* #GP on a read access to an exec-only code segment */
 1572                         if ((type & 0xA) == 0x8)
 1573                                 return (-1);
 1574                 }
 1575 
 1576                 if (prot & PROT_WRITE) {
 1577                         /*
 1578                          * #GP on a write access to a code segment or a
 1579                          * read-only data segment.
 1580                          */
 1581                         if (type & 0x8)                 /* code segment */
 1582                                 return (-1);
 1583 
 1584                         if ((type & 0xA) == 0)          /* read-only data seg */
 1585                                 return (-1);
 1586                 }
 1587 
 1588                 /*
 1589                  * 'desc->limit' is fully expanded taking granularity into
 1590                  * account.
 1591                  */
 1592                 if ((type & 0xC) == 0x4) {
 1593                         /* expand-down data segment */
 1594                         low_limit = desc->limit + 1;
 1595                         high_limit = SEG_DESC_DEF32(desc->access) ?
 1596                             0xffffffff : 0xffff;
 1597                 } else {
 1598                         /* code segment or expand-up data segment */
 1599                         low_limit = 0;
 1600                         high_limit = desc->limit;
 1601                 }
 1602 
 1603                 while (length > 0) {
 1604                         offset &= vie_size2mask(addrsize);
 1605                         if (offset < low_limit || offset > high_limit)
 1606                                 return (-1);
 1607                         offset++;
 1608                         length--;
 1609                 }
 1610         }
 1611 
 1612         /*
 1613          * In 64-bit mode all segments except %fs and %gs have a segment
 1614          * base address of 0.
 1615          */
 1616         if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
 1617             seg != VM_REG_GUEST_GS) {
 1618                 segbase = 0;
 1619         } else {
 1620                 segbase = desc->base;
 1621         }
 1622 
 1623         /*
 1624          * Truncate 'firstoff' to the effective address size before adding
 1625          * it to the segment base.
 1626          */
 1627         firstoff &= vie_size2mask(addrsize);
 1628         *gla = (segbase + firstoff) & vie_size2mask(glasize);
 1629         return (0);
 1630 }
 1631 
 1632 #ifdef _KERNEL
 1633 void
 1634 vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
 1635 {
 1636         KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
 1637             ("%s: invalid instruction length (%d)", __func__, inst_length));
 1638 
 1639         bzero(vie, sizeof(struct vie));
 1640 
 1641         vie->base_register = VM_REG_LAST;
 1642         vie->index_register = VM_REG_LAST;
 1643         vie->segment_register = VM_REG_LAST;
 1644 
 1645         if (inst_length) {
 1646                 bcopy(inst_bytes, vie->inst, inst_length);
 1647                 vie->num_valid = inst_length;
 1648         }
 1649 }
 1650 
 1651 static int
 1652 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
 1653 {
 1654         int error_code = 0;
 1655 
 1656         if (pte & PG_V)
 1657                 error_code |= PGEX_P;
 1658         if (prot & VM_PROT_WRITE)
 1659                 error_code |= PGEX_W;
 1660         if (usermode)
 1661                 error_code |= PGEX_U;
 1662         if (rsvd)
 1663                 error_code |= PGEX_RSV;
 1664         if (prot & VM_PROT_EXECUTE)
 1665                 error_code |= PGEX_I;
 1666 
 1667         return (error_code);
 1668 }
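
/*
 * Editorial note -- illustrative sketch, not part of the original source:
 * the #PF error code assembled by pf_error_code() is an architectural
 * bitmask: bit 0 = P (page was present), bit 1 = W/R (write access),
 * bit 2 = U/S (user mode), bit 3 = RSVD (reserved-bit violation),
 * bit 4 = I/D (instruction fetch).  The miniature version below uses
 * local constants instead of the kernel's PGEX_* macros.
 */
#if 0   /* example only; not compiled with this file */
#include <stdio.h>

#define EX_P    0x01
#define EX_W    0x02
#define EX_U    0x04

static int
fault_code(int usermode, int write, int present)
{
        int code = 0;

        if (present)
                code |= EX_P;
        if (write)
                code |= EX_W;
        if (usermode)
                code |= EX_U;
        return (code);
}

int
main(void)
{
        /* A user-mode write that hit a present, read-only page: P|W|U == 0x7. */
        printf("%#x\n", fault_code(1, 1, 1));
        return (0);
}
#endif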
 1669 
 1670 static void
 1671 ptp_release(void **cookie)
 1672 {
 1673         if (*cookie != NULL) {
 1674                 vm_gpa_release(*cookie);
 1675                 *cookie = NULL;
 1676         }
 1677 }
 1678 
 1679 static void *
 1680 ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie)
 1681 {
 1682         void *ptr;
 1683 
 1684         ptp_release(cookie);
 1685         ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie);
 1686         return (ptr);
 1687 }
 1688 
 1689 int
 1690 vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
 1691     uint64_t gla, int prot, uint64_t *gpa, int *guest_fault)
 1692 {
 1693         int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
 1694         u_int retries;
 1695         uint64_t *ptpbase, ptpphys, pte, pgsize;
 1696         uint32_t *ptpbase32, pte32;
 1697         void *cookie;
 1698 
 1699         *guest_fault = 0;
 1700 
 1701         usermode = (paging->cpl == 3 ? 1 : 0);
 1702         writable = prot & VM_PROT_WRITE;
 1703         cookie = NULL;
 1704         retval = 0;
 1705         retries = 0;
 1706 restart:
 1707         ptpphys = paging->cr3;          /* root of the page tables */
 1708         ptp_release(&cookie);
 1709         if (retries++ > 0)
 1710                 maybe_yield();
 1711 
 1712         if (vie_canonical_check(paging->cpu_mode, gla)) {
 1713                 /*
 1714                  * XXX assuming a non-stack reference; otherwise a stack fault
 1715                  * should be generated.
 1716                  */
 1717                 vm_inject_gp(vm, vcpuid);
 1718                 goto fault;
 1719         }
 1720 
 1721         if (paging->paging_mode == PAGING_MODE_FLAT) {
 1722                 *gpa = gla;
 1723                 goto done;
 1724         }
 1725 
 1726         if (paging->paging_mode == PAGING_MODE_32) {
 1727                 nlevels = 2;
 1728                 while (--nlevels >= 0) {
 1729                         /* Zero out the lower 12 bits. */
 1730                         ptpphys &= ~0xfff;
 1731 
 1732                         ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE,
 1733                             &cookie);
 1734 
 1735                         if (ptpbase32 == NULL)
 1736                                 goto error;
 1737 
 1738                         ptpshift = PAGE_SHIFT + nlevels * 10;
 1739                         ptpindex = (gla >> ptpshift) & 0x3FF;
 1740                         pgsize = 1UL << ptpshift;
 1741 
 1742                         pte32 = ptpbase32[ptpindex];
 1743 
 1744                         if ((pte32 & PG_V) == 0 ||
 1745                             (usermode && (pte32 & PG_U) == 0) ||
 1746                             (writable && (pte32 & PG_RW) == 0)) {
 1747                                 pfcode = pf_error_code(usermode, prot, 0,
 1748                                     pte32);
 1749                                 vm_inject_pf(vm, vcpuid, pfcode, gla);
 1750                                 goto fault;
 1751                         }
 1752 
 1753                         /*
 1754                          * Emulate the x86 MMU's management of the accessed
 1755                          * and dirty flags. While the accessed flag is set
 1756                          * at every level of the page table, the dirty flag
 1757                          * is only set at the last level providing the guest
 1758                          * physical address.
 1759                          */
 1760                         if ((pte32 & PG_A) == 0) {
 1761                                 if (atomic_cmpset_32(&ptpbase32[ptpindex],
 1762                                     pte32, pte32 | PG_A) == 0) {
 1763                                         goto restart;
 1764                                 }
 1765                         }
 1766 
 1767                         /* XXX must be ignored if CR4.PSE=0 */
 1768                         if (nlevels > 0 && (pte32 & PG_PS) != 0)
 1769                                 break;
 1770 
 1771                         ptpphys = pte32;
 1772                 }
 1773 
 1774                 /* Set the dirty bit in the page table entry if necessary */
 1775                 if (writable && (pte32 & PG_M) == 0) {
 1776                         if (atomic_cmpset_32(&ptpbase32[ptpindex],
 1777                             pte32, pte32 | PG_M) == 0) {
 1778                                 goto restart;
 1779                         }
 1780                 }
 1781 
 1782                 /* Zero out the lower 'ptpshift' bits */
 1783                 pte32 >>= ptpshift; pte32 <<= ptpshift;
 1784                 *gpa = pte32 | (gla & (pgsize - 1));
 1785                 goto done;
 1786         }
 1787 
 1788         if (paging->paging_mode == PAGING_MODE_PAE) {
 1789                 /* Zero out the lower 5 bits and the upper 32 bits */
 1790                 ptpphys &= 0xffffffe0UL;
 1791 
 1792                 ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4,
 1793                     &cookie);
 1794                 if (ptpbase == NULL)
 1795                         goto error;
 1796 
 1797                 ptpindex = (gla >> 30) & 0x3;
 1798 
 1799                 pte = ptpbase[ptpindex];
 1800 
 1801                 if ((pte & PG_V) == 0) {
 1802                         pfcode = pf_error_code(usermode, prot, 0, pte);
 1803                         vm_inject_pf(vm, vcpuid, pfcode, gla);
 1804                         goto fault;
 1805                 }
 1806 
 1807                 ptpphys = pte;
 1808 
 1809                 nlevels = 2;
 1810         } else
 1811                 nlevels = 4;
 1812         while (--nlevels >= 0) {
 1813                 /* Zero out the lower 12 bits and the upper 12 bits */
 1814                 ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
 1815 
 1816                 ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie);
 1817                 if (ptpbase == NULL)
 1818                         goto error;
 1819 
 1820                 ptpshift = PAGE_SHIFT + nlevels * 9;
 1821                 ptpindex = (gla >> ptpshift) & 0x1FF;
 1822                 pgsize = 1UL << ptpshift;
 1823 
 1824                 pte = ptpbase[ptpindex];
 1825 
 1826                 if ((pte & PG_V) == 0 ||
 1827                     (usermode && (pte & PG_U) == 0) ||
 1828                     (writable && (pte & PG_RW) == 0)) {
 1829                         pfcode = pf_error_code(usermode, prot, 0, pte);
 1830                         vm_inject_pf(vm, vcpuid, pfcode, gla);
 1831                         goto fault;
 1832                 }
 1833 
 1834                 /* Set the accessed bit in the page table entry */
 1835                 if ((pte & PG_A) == 0) {
 1836                         if (atomic_cmpset_64(&ptpbase[ptpindex],
 1837                             pte, pte | PG_A) == 0) {
 1838                                 goto restart;
 1839                         }
 1840                 }
 1841 
 1842                 if (nlevels > 0 && (pte & PG_PS) != 0) {
 1843                         if (pgsize > 1 * GB) {
 1844                                 pfcode = pf_error_code(usermode, prot, 1, pte);
 1845                                 vm_inject_pf(vm, vcpuid, pfcode, gla);
 1846                                 goto fault;
 1847                         }
 1848                         break;
 1849                 }
 1850 
 1851                 ptpphys = pte;
 1852         }
 1853 
 1854         /* Set the dirty bit in the page table entry if necessary */
 1855         if (writable && (pte & PG_M) == 0) {
 1856                 if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
 1857                         goto restart;
 1858         }
 1859 
 1860         /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
 1861         pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
 1862         *gpa = pte | (gla & (pgsize - 1));
 1863 done:
 1864         ptp_release(&cookie);
 1865         KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d",
 1866             __func__, retval));
 1867         return (retval);
 1868 error:
 1869         retval = EFAULT;
 1870         goto done;
 1871 fault:
 1872         *guest_fault = 1;
 1873         goto done;
 1874 }
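
/*
 * Editorial note -- illustrative sketch, not part of the original source:
 * in the long-mode path of vm_gla2gpa() each level of the walk consumes
 * 9 bits of the linear address (ptpshift = PAGE_SHIFT + nlevels * 9) and
 * the final physical address combines the page frame from the PTE with the
 * low 'ptpshift' bits of the linear address.  The sample address below is
 * hypothetical and only the index arithmetic is shown.
 */
#if 0   /* example only; not compiled with this file */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t gla = 0x00007f8012345678UL;    /* hypothetical guest linear address */
        int level;

        /* level 3 = PML4, level 2 = PDPT, level 1 = PD, level 0 = PT */
        for (level = 3; level >= 0; level--) {
                int shift = 12 + level * 9;
                printf("level %d index %#lx\n", level,
                    (unsigned long)((gla >> shift) & 0x1FF));
        }
        printf("page offset %#lx\n", (unsigned long)(gla & 0xFFF));
        return (0);
}
#endif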
 1875 
 1876 int
 1877 vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
 1878     uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
 1879 {
 1880         struct vm_copyinfo copyinfo[2];
 1881         int error, prot;
 1882 
 1883         if (inst_length > VIE_INST_SIZE)
 1884                 panic("vmm_fetch_instruction: invalid length %d", inst_length);
 1885 
 1886         prot = PROT_READ | PROT_EXEC;
 1887         error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
 1888             copyinfo, nitems(copyinfo), faultptr);
 1889         if (error || *faultptr)
 1890                 return (error);
 1891 
 1892         vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
 1893         vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 1894         vie->num_valid = inst_length;
 1895         return (0);
 1896 }
 1897 
 1898 static int
 1899 vie_peek(struct vie *vie, uint8_t *x)
 1900 {
 1901 
 1902         if (vie->num_processed < vie->num_valid) {
 1903                 *x = vie->inst[vie->num_processed];
 1904                 return (0);
 1905         } else
 1906                 return (-1);
 1907 }
 1908 
 1909 static void
 1910 vie_advance(struct vie *vie)
 1911 {
 1912 
 1913         vie->num_processed++;
 1914 }
 1915 
 1916 static bool
 1917 segment_override(uint8_t x, int *seg)
 1918 {
 1919 
 1920         switch (x) {
 1921         case 0x2E:
 1922                 *seg = VM_REG_GUEST_CS;
 1923                 break;
 1924         case 0x36:
 1925                 *seg = VM_REG_GUEST_SS;
 1926                 break;
 1927         case 0x3E:
 1928                 *seg = VM_REG_GUEST_DS;
 1929                 break;
 1930         case 0x26:
 1931                 *seg = VM_REG_GUEST_ES;
 1932                 break;
 1933         case 0x64:
 1934                 *seg = VM_REG_GUEST_FS;
 1935                 break;
 1936         case 0x65:
 1937                 *seg = VM_REG_GUEST_GS;
 1938                 break;
 1939         default:
 1940                 return (false);
 1941         }
 1942         return (true);
 1943 }
 1944 
 1945 static int
 1946 decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
 1947 {
 1948         uint8_t x;
 1949 
 1950         while (1) {
 1951                 if (vie_peek(vie, &x))
 1952                         return (-1);
 1953 
 1954                 if (x == 0x66)
 1955                         vie->opsize_override = 1;
 1956                 else if (x == 0x67)
 1957                         vie->addrsize_override = 1;
 1958                 else if (x == 0xF3)
 1959                         vie->repz_present = 1;
 1960                 else if (x == 0xF2)
 1961                         vie->repnz_present = 1;
 1962                 else if (segment_override(x, &vie->segment_register))
 1963                         vie->segment_override = 1;
 1964                 else
 1965                         break;
 1966 
 1967                 vie_advance(vie);
 1968         }
 1969 
 1970         /*
 1971          * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
 1972          * - Only one REX prefix is allowed per instruction.
 1973          * - The REX prefix must immediately precede the opcode byte or the
 1974          *   escape opcode byte.
 1975          * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
 1976          *   the mandatory prefix must come before the REX prefix.
 1977          */
 1978         if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
 1979                 vie->rex_present = 1;
 1980                 vie->rex_w = x & 0x8 ? 1 : 0;
 1981                 vie->rex_r = x & 0x4 ? 1 : 0;
 1982                 vie->rex_x = x & 0x2 ? 1 : 0;
 1983                 vie->rex_b = x & 0x1 ? 1 : 0;
 1984                 vie_advance(vie);
 1985         }
 1986 
 1987         /*
 1988          * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
 1989          */
 1990         if (cpu_mode == CPU_MODE_64BIT) {
 1991                 /*
 1992                  * Default address size is 64-bits and default operand size
 1993                  * is 32-bits.
 1994                  */
 1995                 vie->addrsize = vie->addrsize_override ? 4 : 8;
 1996                 if (vie->rex_w)
 1997                         vie->opsize = 8;
 1998                 else if (vie->opsize_override)
 1999                         vie->opsize = 2;
 2000                 else
 2001                         vie->opsize = 4;
 2002         } else if (cs_d) {
 2003                 /* Default address and operand sizes are 32-bits */
 2004                 vie->addrsize = vie->addrsize_override ? 2 : 4;
 2005                 vie->opsize = vie->opsize_override ? 2 : 4;
 2006         } else {
 2007                 /* Default address and operand sizes are 16-bits */
 2008                 vie->addrsize = vie->addrsize_override ? 4 : 2;
 2009                 vie->opsize = vie->opsize_override ? 4 : 2;
 2010         }
 2011         return (0);
 2012 }
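
/*
 * Editorial note -- illustrative sketch, not part of the original source:
 * the size resolution in decode_prefixes() boils down to three cases --
 * 64-bit mode defaults to 8-byte addresses and 4-byte operands (REX.W
 * forces 8-byte operands and takes precedence over the 0x66 override),
 * CS.D=1 defaults to 4/4, and everything else defaults to 2/2, with the
 * 0x66/0x67 prefixes toggling to the non-default size.  A compact sketch:
 */
#if 0   /* example only; not compiled with this file */
#include <stdio.h>

static void
sizes(int mode64, int cs_d, int rex_w, int osz_ovr, int asz_ovr,
    int *opsize, int *addrsize)
{
        if (mode64) {
                *addrsize = asz_ovr ? 4 : 8;
                *opsize = rex_w ? 8 : (osz_ovr ? 2 : 4);
        } else if (cs_d) {
                *addrsize = asz_ovr ? 2 : 4;
                *opsize = osz_ovr ? 2 : 4;
        } else {
                *addrsize = asz_ovr ? 4 : 2;
                *opsize = osz_ovr ? 4 : 2;
        }
}

int
main(void)
{
        int op, addr;

        sizes(1, 0, 1, 1, 0, &op, &addr);               /* REX.W beats 0x66 */
        printf("opsize=%d addrsize=%d\n", op, addr);    /* 8, 8 */
        return (0);
}
#endif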
 2013 
 2014 static int
 2015 decode_two_byte_opcode(struct vie *vie)
 2016 {
 2017         uint8_t x;
 2018 
 2019         if (vie_peek(vie, &x))
 2020                 return (-1);
 2021 
 2022         vie->op = two_byte_opcodes[x];
 2023 
 2024         if (vie->op.op_type == VIE_OP_TYPE_NONE)
 2025                 return (-1);
 2026 
 2027         vie_advance(vie);
 2028         return (0);
 2029 }
 2030 
 2031 static int
 2032 decode_opcode(struct vie *vie)
 2033 {
 2034         uint8_t x;
 2035 
 2036         if (vie_peek(vie, &x))
 2037                 return (-1);
 2038 
 2039         vie->op = one_byte_opcodes[x];
 2040 
 2041         if (vie->op.op_type == VIE_OP_TYPE_NONE)
 2042                 return (-1);
 2043 
 2044         vie_advance(vie);
 2045 
 2046         if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
 2047                 return (decode_two_byte_opcode(vie));
 2048 
 2049         return (0);
 2050 }
 2051 
 2052 static int
 2053 decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
 2054 {
 2055         uint8_t x;
 2056 
 2057         if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
 2058                 return (0);
 2059 
 2060         if (cpu_mode == CPU_MODE_REAL)
 2061                 return (-1);
 2062 
 2063         if (vie_peek(vie, &x))
 2064                 return (-1);
 2065 
 2066         vie->mod = (x >> 6) & 0x3;
 2067         vie->rm =  (x >> 0) & 0x7;
 2068         vie->reg = (x >> 3) & 0x7;
 2069 
 2070         /*
 2071          * A direct addressing mode makes no sense in the context of an EPT
 2072          * fault. There has to be a memory access involved to cause the
 2073          * EPT fault.
 2074          */
 2075         if (vie->mod == VIE_MOD_DIRECT)
 2076                 return (-1);
 2077 
 2078         if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
 2079             (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
 2080                 /*
 2081                  * Table 2-5: Special Cases of REX Encodings
 2082                  *
 2083                  * mod=0, r/m=5 is used in the compatibility mode to
 2084                  * indicate a disp32 without a base register.
 2085                  *
 2086                  * mod!=3, r/m=4 is used in the compatibility mode to
 2087                  * indicate that the SIB byte is present.
 2088                  *
 2089                  * The 'b' bit in the REX prefix is don't care in
 2090                  * this case.
 2091                  */
 2092         } else {
 2093                 vie->rm |= (vie->rex_b << 3);
 2094         }
 2095 
 2096         vie->reg |= (vie->rex_r << 3);
 2097 
 2098         /* SIB */
 2099         if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
 2100                 goto done;
 2101 
 2102         vie->base_register = gpr_map[vie->rm];
 2103 
 2104         switch (vie->mod) {
 2105         case VIE_MOD_INDIRECT_DISP8:
 2106                 vie->disp_bytes = 1;
 2107                 break;
 2108         case VIE_MOD_INDIRECT_DISP32:
 2109                 vie->disp_bytes = 4;
 2110                 break;
 2111         case VIE_MOD_INDIRECT:
 2112                 if (vie->rm == VIE_RM_DISP32) {
 2113                         vie->disp_bytes = 4;
 2114                         /*
 2115                          * Table 2-7. RIP-Relative Addressing
 2116                          *
 2117                          * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
 2118                          * whereas in compatibility mode it just implies disp32.
 2119                          */
 2120 
 2121                         if (cpu_mode == CPU_MODE_64BIT)
 2122                                 vie->base_register = VM_REG_GUEST_RIP;
 2123                         else
 2124                                 vie->base_register = VM_REG_LAST;
 2125                 }
 2126                 break;
 2127         }
 2128 
 2129 done:
 2130         vie_advance(vie);
 2131 
 2132         return (0);
 2133 }
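
/*
 * Editorial note -- illustrative sketch, not part of the original source:
 * a ModRM byte packs three fields -- mod (bits 7:6), reg (bits 5:3) and
 * r/m (bits 2:0) -- and decode_modrm() widens reg and r/m to four bits
 * with REX.R and REX.B.  The example byte and prefix bits below are
 * hypothetical.
 */
#if 0   /* example only; not compiled with this file */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint8_t modrm = 0x88;   /* hypothetical: mod=10 reg=001 rm=000 */
        int rex_r = 1, rex_b = 0;

        printf("mod=%d reg=%d rm=%d\n",
            (modrm >> 6) & 3,
            ((modrm >> 3) & 7) | (rex_r << 3),  /* reg field extended to %r9 */
            (modrm & 7) | (rex_b << 3));
        return (0);
}
#endif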
 2134 
 2135 static int
 2136 decode_sib(struct vie *vie)
 2137 {
 2138         uint8_t x;
 2139 
 2140         /* Proceed only if SIB byte is present */
 2141         if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
 2142                 return (0);
 2143 
 2144         if (vie_peek(vie, &x))
 2145                 return (-1);
 2146 
 2147         /* De-construct the SIB byte */
 2148         vie->ss = (x >> 6) & 0x3;
 2149         vie->index = (x >> 3) & 0x7;
 2150         vie->base = (x >> 0) & 0x7;
 2151 
 2152         /* Apply the REX prefix modifiers */
 2153         vie->index |= vie->rex_x << 3;
 2154         vie->base |= vie->rex_b << 3;
 2155 
 2156         switch (vie->mod) {
 2157         case VIE_MOD_INDIRECT_DISP8:
 2158                 vie->disp_bytes = 1;
 2159                 break;
 2160         case VIE_MOD_INDIRECT_DISP32:
 2161                 vie->disp_bytes = 4;
 2162                 break;
 2163         }
 2164 
 2165         if (vie->mod == VIE_MOD_INDIRECT &&
 2166             (vie->base == 5 || vie->base == 13)) {
 2167                 /*
 2168                  * Special case when base register is unused if mod = 0
 2169                  * and base = %rbp or %r13.
 2170                  *
 2171                  * Documented in:
 2172                  * Table 2-3: 32-bit Addressing Forms with the SIB Byte
 2173                  * Table 2-5: Special Cases of REX Encodings
 2174                  */
 2175                 vie->disp_bytes = 4;
 2176         } else {
 2177                 vie->base_register = gpr_map[vie->base];
 2178         }
 2179 
 2180         /*
 2181          * All encodings of 'index' are valid except for %rsp (4).
 2182          *
 2183          * Documented in:
 2184          * Table 2-3: 32-bit Addressing Forms with the SIB Byte
 2185          * Table 2-5: Special Cases of REX Encodings
 2186          */
 2187         if (vie->index != 4)
 2188                 vie->index_register = gpr_map[vie->index];
 2189 
 2190         /* 'scale' makes sense only in the context of an index register */
 2191         if (vie->index_register < VM_REG_LAST)
 2192                 vie->scale = 1 << vie->ss;
 2193 
 2194         vie_advance(vie);
 2195 
 2196         return (0);
 2197 }
 2198 
 2199 static int
 2200 decode_displacement(struct vie *vie)
 2201 {
 2202         int n, i;
 2203         uint8_t x;
 2204 
 2205         union {
 2206                 char    buf[4];
 2207                 int8_t  signed8;
 2208                 int32_t signed32;
 2209         } u;
 2210 
 2211         if ((n = vie->disp_bytes) == 0)
 2212                 return (0);
 2213 
 2214         if (n != 1 && n != 4)
 2215                 panic("decode_displacement: invalid disp_bytes %d", n);
 2216 
 2217         for (i = 0; i < n; i++) {
 2218                 if (vie_peek(vie, &x))
 2219                         return (-1);
 2220 
 2221                 u.buf[i] = x;
 2222                 vie_advance(vie);
 2223         }
 2224 
 2225         if (n == 1)
 2226                 vie->displacement = u.signed8;          /* sign-extended */
 2227         else
 2228                 vie->displacement = u.signed32;         /* sign-extended */
 2229 
 2230         return (0);
 2231 }
 2232 
 2233 static int
 2234 decode_immediate(struct vie *vie)
 2235 {
 2236         int i, n;
 2237         uint8_t x;
 2238         union {
 2239                 char    buf[4];
 2240                 int8_t  signed8;
 2241                 int16_t signed16;
 2242                 int32_t signed32;
 2243         } u;
 2244 
 2245         /* Figure out immediate operand size (if any) */
 2246         if (vie->op.op_flags & VIE_OP_F_IMM) {
 2247                 /*
 2248                  * Section 2.2.1.5 "Immediates", Intel SDM:
 2249                  * In 64-bit mode the typical size of immediate operands
 2250                  * remains 32 bits. When the operand size is 64 bits, the
 2251                  * processor sign-extends all immediates to 64 bits prior
 2252                  * to their use.
 2253                  */
 2254                 if (vie->opsize == 4 || vie->opsize == 8)
 2255                         vie->imm_bytes = 4;
 2256                 else
 2257                         vie->imm_bytes = 2;
 2258         } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
 2259                 vie->imm_bytes = 1;
 2260         }
 2261 
 2262         if ((n = vie->imm_bytes) == 0)
 2263                 return (0);
 2264 
 2265         KASSERT(n == 1 || n == 2 || n == 4,
 2266             ("%s: invalid number of immediate bytes: %d", __func__, n));
 2267 
 2268         for (i = 0; i < n; i++) {
 2269                 if (vie_peek(vie, &x))
 2270                         return (-1);
 2271 
 2272                 u.buf[i] = x;
 2273                 vie_advance(vie);
 2274         }
 2275 
 2276         /* sign-extend the immediate value before use */
 2277         if (n == 1)
 2278                 vie->immediate = u.signed8;
 2279         else if (n == 2)
 2280                 vie->immediate = u.signed16;
 2281         else
 2282                 vie->immediate = u.signed32;
 2283 
 2284         return (0);
 2285 }
 2286 
 2287 static int
 2288 decode_moffset(struct vie *vie)
 2289 {
 2290         int i, n;
 2291         uint8_t x;
 2292         union {
 2293                 char    buf[8];
 2294                 uint64_t u64;
 2295         } u;
 2296 
 2297         if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
 2298                 return (0);
 2299 
 2300         /*
 2301          * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
 2302          * The memory offset size follows the address-size of the instruction.
 2303          */
 2304         n = vie->addrsize;
 2305         KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
 2306 
 2307         u.u64 = 0;
 2308         for (i = 0; i < n; i++) {
 2309                 if (vie_peek(vie, &x))
 2310                         return (-1);
 2311 
 2312                 u.buf[i] = x;
 2313                 vie_advance(vie);
 2314         }
 2315         vie->displacement = u.u64;
 2316         return (0);
 2317 }
 2318 
 2319 /*
 2320  * Verify that the 'guest linear address' provided as collateral of the nested
 2321  * page table fault matches with our instruction decoding.
 2322  */
 2323 static int
 2324 verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
 2325     enum vm_cpu_mode cpu_mode)
 2326 {
 2327         int error;
 2328         uint64_t base, segbase, idx, gla2;
 2329         enum vm_reg_name seg;
 2330         struct seg_desc desc;
 2331 
 2332         /* Skip 'gla' verification */
 2333         if (gla == VIE_INVALID_GLA)
 2334                 return (0);
 2335 
 2336         base = 0;
 2337         if (vie->base_register != VM_REG_LAST) {
 2338                 error = vm_get_register(vm, cpuid, vie->base_register, &base);
 2339                 if (error) {
 2340                         printf("verify_gla: error %d getting base reg %d\n",
 2341                                 error, vie->base_register);
 2342                         return (-1);
 2343                 }
 2344 
 2345                 /*
 2346                  * RIP-relative addressing starts from the following
 2347                  * instruction
 2348                  */
 2349                 if (vie->base_register == VM_REG_GUEST_RIP)
 2350                         base += vie->num_processed;
 2351         }
 2352 
 2353         idx = 0;
 2354         if (vie->index_register != VM_REG_LAST) {
 2355                 error = vm_get_register(vm, cpuid, vie->index_register, &idx);
 2356                 if (error) {
 2357                         printf("verify_gla: error %d getting index reg %d\n",
 2358                                 error, vie->index_register);
 2359                         return (-1);
 2360                 }
 2361         }
 2362 
 2363         /*
 2364          * From "Specifying a Segment Selector", Intel SDM, Vol 1
 2365          *
 2366          * In 64-bit mode, segmentation is generally (but not
 2367          * completely) disabled.  The exceptions are the FS and GS
 2368          * segments.
 2369          *
 2370          * In legacy IA-32 mode, when the ESP or EBP register is used
 2371          * as the base, the SS segment is the default segment.  For
 2372          * other data references, except when relative to the stack or a
 2373          * string destination, the DS segment is the default.  These
 2374          * can be overridden to allow other segments to be accessed.
 2375          */
 2376         if (vie->segment_override)
 2377                 seg = vie->segment_register;
 2378         else if (vie->base_register == VM_REG_GUEST_RSP ||
 2379             vie->base_register == VM_REG_GUEST_RBP)
 2380                 seg = VM_REG_GUEST_SS;
 2381         else
 2382                 seg = VM_REG_GUEST_DS;
 2383         if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
 2384             seg != VM_REG_GUEST_GS) {
 2385                 segbase = 0;
 2386         } else {
 2387                 error = vm_get_seg_desc(vm, cpuid, seg, &desc);
 2388                 if (error) {
 2389                         printf("verify_gla: error %d getting segment"
 2390                                " descriptor %d\n", error,
 2391                                vie->segment_register);
 2392                         return (-1);
 2393                 }
 2394                 segbase = desc.base;
 2395         }
 2396 
 2397         gla2 = segbase + base + vie->scale * idx + vie->displacement;
 2398         gla2 &= size2mask[vie->addrsize];
 2399         if (gla != gla2) {
 2400                 printf("verify_gla mismatch: segbase(0x%0lx), "
 2401                        "base(0x%0lx), scale(%d), index(0x%0lx), "
 2402                        "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
 2403                        segbase, base, vie->scale, idx, vie->displacement,
 2404                        gla, gla2);
 2405                 return (-1);
 2406         }
 2407 
 2408         return (0);
 2409 }
 2410 
 2411 int
 2412 vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
 2413                        enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
 2414 {
 2415 
 2416         if (decode_prefixes(vie, cpu_mode, cs_d))
 2417                 return (-1);
 2418 
 2419         if (decode_opcode(vie))
 2420                 return (-1);
 2421 
 2422         if (decode_modrm(vie, cpu_mode))
 2423                 return (-1);
 2424 
 2425         if (decode_sib(vie))
 2426                 return (-1);
 2427 
 2428         if (decode_displacement(vie))
 2429                 return (-1);
 2430 
 2431         if (decode_immediate(vie))
 2432                 return (-1);
 2433 
 2434         if (decode_moffset(vie))
 2435                 return (-1);
 2436 
 2437         if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
 2438                 if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
 2439                         return (-1);
 2440         }
 2441 
 2442         vie->decoded = 1;       /* success */
 2443 
 2444         return (0);
 2445 }
 2446 #endif  /* _KERNEL */
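
/*
 * Editorial note -- hedged usage outline, not part of the original source:
 * callers elsewhere in vmm typically drive this file in three steps --
 * fetch the faulting instruction's bytes, decode them into a 'struct vie',
 * then emulate the memory access through the MMIO callbacks.  The sketch
 * below is an outline only; 'vm', 'vcpuid', 'paging', 'rip', 'inst_length',
 * 'gla', 'gpa', 'cs_d' and the mmio_* callbacks are assumed to exist in
 * the caller.
 */
#if 0   /* outline only; not compiled with this file */
        struct vie vie;
        int fault;

        vie_init(&vie, NULL, 0);
        if (vmm_fetch_instruction(vm, vcpuid, &paging, rip, inst_length,
            &vie, &fault) != 0 || fault)
                return;         /* restart the vcpu or reflect the fault */
        if (vmm_decode_instruction(vm, vcpuid, gla, paging.cpu_mode,
            cs_d, &vie) != 0)
                return;         /* could not decode the instruction */
        (void)vmm_emulate_instruction(vm, vcpuid, gpa, &vie, &paging,
            mmio_read, mmio_write, mmio_arg);
#endif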


