FreeBSD/Linux Kernel Cross Reference
sys/amd64/vmm/x86.c

    1 /*-
    2  * Copyright (c) 2011 NetApp, Inc.
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  *
   26  * $FreeBSD: releng/11.0/sys/amd64/vmm/x86.c 282520 2015-05-06 05:40:20Z neel $
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD: releng/11.0/sys/amd64/vmm/x86.c 282520 2015-05-06 05:40:20Z neel $");
   31 
   32 #include <sys/param.h>
   33 #include <sys/pcpu.h>
   34 #include <sys/systm.h>
   35 #include <sys/sysctl.h>
   36 
   37 #include <machine/clock.h>
   38 #include <machine/cpufunc.h>
   39 #include <machine/md_var.h>
   40 #include <machine/segments.h>
   41 #include <machine/specialreg.h>
   42 
   43 #include <machine/vmm.h>
   44 
   45 #include "vmm_host.h"
   46 #include "vmm_ktr.h"
   47 #include "vmm_util.h"
   48 #include "x86.h"
   49 
   50 SYSCTL_DECL(_hw_vmm);
   51 static SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL);
   52 
   53 #define CPUID_VM_HIGH           0x40000000
   54 
   55 static const char bhyve_id[12] = "bhyve bhyve ";
   56 
   57 static uint64_t bhyve_xcpuids;
   58 SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0,
   59     "Number of times an unknown cpuid leaf was accessed");
   60 
   61 /*
   62  * The default CPU topology is a single thread per package.
   63  */
   64 static u_int threads_per_core = 1;
   65 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, threads_per_core, CTLFLAG_RDTUN,
   66     &threads_per_core, 0, NULL);
   67 
   68 static u_int cores_per_package = 1;
   69 SYSCTL_UINT(_hw_vmm_topology, OID_AUTO, cores_per_package, CTLFLAG_RDTUN,
   70     &cores_per_package, 0, NULL);
   71 
   72 static int cpuid_leaf_b = 1;
   73 SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN,
   74     &cpuid_leaf_b, 0, NULL);
   75 
   76 /*
   77  * Round up to the next power of two, if necessary, and then take log2.
   78  * Returns -1 if argument is zero.
   79  */
   80 static __inline int
   81 log2(u_int x)
   82 {
   83 
   84         return (fls(x << (1 - powerof2(x))) - 1);
   85 }
   86 
   87 int
   88 x86_emulate_cpuid(struct vm *vm, int vcpu_id,
   89                   uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
   90 {
   91         const struct xsave_limits *limits;
   92         uint64_t cr4;
   93         int error, enable_invpcid, level, width, x2apic_id;
   94         unsigned int func, regs[4], logical_cpus;
   95         enum x2apic_state x2apic_state;
   96 
   97         VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx);
   98 
   99         /*
  100          * Requests for invalid CPUID levels should map to the highest
  101          * available level instead.
  102          */
  103         if (cpu_exthigh != 0 && *eax >= 0x80000000) {
  104                 if (*eax > cpu_exthigh)
  105                         *eax = cpu_exthigh;
  106         } else if (*eax >= 0x40000000) {
  107                 if (*eax > CPUID_VM_HIGH)
  108                         *eax = CPUID_VM_HIGH;
  109         } else if (*eax > cpu_high) {
  110                 *eax = cpu_high;
  111         }
  112 
  113         func = *eax;
  114 
  115         /*
  116          * In general the approach used for CPU topology is to
  117          * advertise a flat topology where all CPUs are packages with
  118          * no multi-core or SMT.
  119          */
  120         switch (func) {
  121                 /*
  122                  * Pass these through to the guest
  123                  */
  124                 case CPUID_0000_0000:
  125                 case CPUID_0000_0002:
  126                 case CPUID_0000_0003:
  127                 case CPUID_8000_0000:
  128                 case CPUID_8000_0002:
  129                 case CPUID_8000_0003:
  130                 case CPUID_8000_0004:
  131                 case CPUID_8000_0006:
  132                         cpuid_count(*eax, *ecx, regs);
  133                         break;
  134                 case CPUID_8000_0008:
  135                         cpuid_count(*eax, *ecx, regs);
  136                         if (vmm_is_amd()) {
  137                                 /*
  138                                  * XXX this might appear silly because AMD
  139                                  * cpus don't have threads.
  140                                  *
  141                                  * However this matches the logical cpus as
  142                                  * advertised by leaf 0x1 and will work even
  143                                  * if the 'threads_per_core' tunable is set
  144                                  * incorrectly on an AMD host.
  145                                  */
  146                                 logical_cpus = threads_per_core *
  147                                     cores_per_package;
  148                                 regs[2] = logical_cpus - 1;
  149                         }
  150                         break;
  151 
  152                 case CPUID_8000_0001:
  153                         cpuid_count(*eax, *ecx, regs);
  154 
  155                         /*
  156                          * Hide SVM and Topology Extension features from guest.
  157                          */
  158                         regs[2] &= ~(AMDID2_SVM | AMDID2_TOPOLOGY);
  159 
  160                         /*
  161                          * Don't advertise extended performance counter MSRs
  162                          * to the guest.
  163                          */
  164                         regs[2] &= ~AMDID2_PCXC;
  165                         regs[2] &= ~AMDID2_PNXC;
  166                         regs[2] &= ~AMDID2_PTSCEL2I;
  167 
  168                         /*
  169                          * Don't advertise Instruction Based Sampling feature.
  170                          */
  171                         regs[2] &= ~AMDID2_IBS;
  172 
  173                         /* NodeID MSR not available */
  174                         regs[2] &= ~AMDID2_NODE_ID;
  175 
  176                         /* Don't advertise the OS visible workaround feature */
  177                         regs[2] &= ~AMDID2_OSVW;
  178 
  179                         /*
  180                          * Hide rdtscp/ia32_tsc_aux until we know how
  181                          * to deal with them.
  182                          */
  183                         regs[3] &= ~AMDID_RDTSCP;
  184                         break;
  185 
  186                 case CPUID_8000_0007:
  187                         /*
  188                          * AMD uses this leaf to advertise the processor's
  189                          * power monitoring and RAS capabilities. These
  190                          * features are hardware-specific and exposing
  191                          * them to a guest doesn't make a lot of sense.
  192                          *
  193                          * Intel uses this leaf only to advertise the
  194                          * "Invariant TSC" feature with all other bits
  195                          * being reserved (set to zero).
  196                          */
  197                         regs[0] = 0;
  198                         regs[1] = 0;
  199                         regs[2] = 0;
  200                         regs[3] = 0;
  201 
  202                         /*
  203                          * "Invariant TSC" can be advertised to the guest if:
  204                          * - host TSC frequency is invariant
  205                          * - host TSCs are synchronized across physical cpus
  206                          *
  207                          * XXX This still falls short because the vcpu
  208                          * can observe the TSC moving backwards as it
  209                          * migrates across physical cpus. But at least
  210                          * it should discourage the guest from using the
  211                          * TSC to keep track of time.
  212                          */
  213                         if (tsc_is_invariant && smp_tsc)
  214                                 regs[3] |= AMDPM_TSC_INVARIANT;
  215                         break;
  216 
  217                 case CPUID_0000_0001:
  218                         do_cpuid(1, regs);
  219 
  220                         error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
  221                         if (error) {
  222                                 panic("x86_emulate_cpuid: error %d "
  223                                       "fetching x2apic state", error);
  224                         }
  225 
  226                         /*
  227                          * Override the APIC ID only in ebx
  228                          */
  229                         regs[1] &= ~(CPUID_LOCAL_APIC_ID);
  230                         regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
  231 
  232                         /*
  233                          * Don't expose VMX, SpeedStep, TME or SMX capability.
  234                          * Advertise x2APIC capability and Hypervisor guest.
  235                          */
  236                         regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
  237                         regs[2] &= ~(CPUID2_SMX);
  238 
  239                         regs[2] |= CPUID2_HV;
  240 
  241                         if (x2apic_state != X2APIC_DISABLED)
  242                                 regs[2] |= CPUID2_X2APIC;
  243                         else
  244                                 regs[2] &= ~CPUID2_X2APIC;
  245 
  246                         /*
  247                          * Only advertise CPUID2_XSAVE in the guest if
  248                          * the host is using XSAVE.
  249                          */
  250                         if (!(regs[2] & CPUID2_OSXSAVE))
  251                                 regs[2] &= ~CPUID2_XSAVE;
  252 
  253                         /*
  254                          * If CPUID2_XSAVE is being advertised and the
  255                          * guest has set CR4_XSAVE, set
  256                          * CPUID2_OSXSAVE.
  257                          */
  258                         regs[2] &= ~CPUID2_OSXSAVE;
  259                         if (regs[2] & CPUID2_XSAVE) {
  260                                 error = vm_get_register(vm, vcpu_id,
  261                                     VM_REG_GUEST_CR4, &cr4);
  262                                 if (error)
  263                                         panic("x86_emulate_cpuid: error %d "
  264                                               "fetching %%cr4", error);
  265                                 if (cr4 & CR4_XSAVE)
  266                                         regs[2] |= CPUID2_OSXSAVE;
  267                         }
  268 
  269                         /*
  270                          * Hide monitor/mwait until we know how to deal with
  271                          * these instructions.
  272                          */
  273                         regs[2] &= ~CPUID2_MON;
  274 
  275                         /*
  276                          * Hide the performance and debug features.
  277                          */
  278                         regs[2] &= ~CPUID2_PDCM;
  279 
  280                         /*
  281                          * No TSC deadline support in the APIC yet
  282                          */
  283                         regs[2] &= ~CPUID2_TSCDLT;
  284 
  285                         /*
  286                          * Hide thermal monitoring
  287                          */
  288                         regs[3] &= ~(CPUID_ACPI | CPUID_TM);
  289 
  290                         /*
  291                          * Hide the debug store capability.
  292                          */
  293                         regs[3] &= ~CPUID_DS;
  294 
  295                         /*
  296                          * Advertise the Machine Check and MTRR capability.
  297                          *
  298                          * Some guest OSes (e.g. Windows) will not boot if
  299                          * these features are absent.
  300                          */
  301                         regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR);
  302 
  303                         logical_cpus = threads_per_core * cores_per_package;
  304                         regs[1] &= ~CPUID_HTT_CORES;
  305                         regs[1] |= (logical_cpus & 0xff) << 16;
  306                         regs[3] |= CPUID_HTT;
  307                         break;
  308 
  309                 case CPUID_0000_0004:
  310                         cpuid_count(*eax, *ecx, regs);
  311 
  312                         if (regs[0] || regs[1] || regs[2] || regs[3]) {
  313                                 regs[0] &= 0x3ff;
  314                                 regs[0] |= (cores_per_package - 1) << 26;
  315                                 /*
  316                                  * Cache topology:
  317                                  * - L1 and L2 are shared only by the logical
  318                                  *   processors in a single core.
  319                                  * - L3 and above are shared by all logical
  320                                  *   processors in the package.
  321                                  */
  322                                 logical_cpus = threads_per_core;
  323                                 level = (regs[0] >> 5) & 0x7;
  324                                 if (level >= 3)
  325                                         logical_cpus *= cores_per_package;
  326                                 regs[0] |= (logical_cpus - 1) << 14;
  327                         }
  328                         break;
  329 
  330                 case CPUID_0000_0007:
  331                         regs[0] = 0;
  332                         regs[1] = 0;
  333                         regs[2] = 0;
  334                         regs[3] = 0;
  335 
  336                         /* leaf 0 */
  337                         if (*ecx == 0) {
  338                                 cpuid_count(*eax, *ecx, regs);
  339 
  340                                 /* Only leaf 0 is supported */
  341                                 regs[0] = 0;
  342 
  343                                 /*
  344                                  * Expose known-safe features.
  345                                  */
  346                                 regs[1] &= (CPUID_STDEXT_FSGSBASE |
  347                                     CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE |
  348                                     CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 |
  349                                     CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM |
  350                                     CPUID_STDEXT_AVX512F |
  351                                     CPUID_STDEXT_AVX512PF |
  352                                     CPUID_STDEXT_AVX512ER |
  353                                     CPUID_STDEXT_AVX512CD);
  354                                 regs[2] = 0;
  355                                 regs[3] = 0;
  356 
  357                                 /* Advertise INVPCID if it is enabled. */
  358                                 error = vm_get_capability(vm, vcpu_id,
  359                                     VM_CAP_ENABLE_INVPCID, &enable_invpcid);
  360                                 if (error == 0 && enable_invpcid)
  361                                         regs[1] |= CPUID_STDEXT_INVPCID;
  362                         }
  363                         break;
  364 
  365                 case CPUID_0000_0006:
  366                         regs[0] = CPUTPM1_ARAT;
  367                         regs[1] = 0;
  368                         regs[2] = 0;
  369                         regs[3] = 0;
  370                         break;
  371 
  372                 case CPUID_0000_000A:
  373                         /*
  374                          * Handle the access, but report 0 for
  375                          * all options
  376                          */
  377                         regs[0] = 0;
  378                         regs[1] = 0;
  379                         regs[2] = 0;
  380                         regs[3] = 0;
  381                         break;
  382 
  383                 case CPUID_0000_000B:
  384                         /*
  385                          * Processor topology enumeration
  386                          */
  387                         if (*ecx == 0) {
  388                                 logical_cpus = threads_per_core;
  389                                 width = log2(logical_cpus);
  390                                 level = CPUID_TYPE_SMT;
  391                                 x2apic_id = vcpu_id;
  392                         }
  393 
  394                         if (*ecx == 1) {
  395                                 logical_cpus = threads_per_core *
  396                                     cores_per_package;
  397                                 width = log2(logical_cpus);
  398                                 level = CPUID_TYPE_CORE;
  399                                 x2apic_id = vcpu_id;
  400                         }
  401 
  402                         if (!cpuid_leaf_b || *ecx >= 2) {
  403                                 width = 0;
  404                                 logical_cpus = 0;
  405                                 level = 0;
  406                                 x2apic_id = 0;
  407                         }
  408 
  409                         regs[0] = width & 0x1f;
  410                         regs[1] = logical_cpus & 0xffff;
  411                         regs[2] = (level << 8) | (*ecx & 0xff);
  412                         regs[3] = x2apic_id;
  413                         break;
  414 
  415                 case CPUID_0000_000D:
  416                         limits = vmm_get_xsave_limits();
  417                         if (!limits->xsave_enabled) {
  418                                 regs[0] = 0;
  419                                 regs[1] = 0;
  420                                 regs[2] = 0;
  421                                 regs[3] = 0;
  422                                 break;
  423                         }
  424 
  425                         cpuid_count(*eax, *ecx, regs);
  426                         switch (*ecx) {
  427                         case 0:
  428                                 /*
  429                                  * Only permit the guest to use bits
  430                                  * that are active in the host in
  431                                  * %xcr0.  Also, claim that the
  432                                  * maximum save area size is
  433                                  * equivalent to the host's current
  434                                  * save area size.  Since this runs
  435                                  * "inside" of vmrun(), it runs with
  436                                  * the guest's xcr0, so the current
  437                                  * save area size is correct as-is.
  438                                  */
  439                                 regs[0] &= limits->xcr0_allowed;
  440                                 regs[2] = limits->xsave_max_size;
  441                                 regs[3] &= (limits->xcr0_allowed >> 32);
  442                                 break;
  443                         case 1:
  444                                 /* Only permit XSAVEOPT. */
  445                                 regs[0] &= CPUID_EXTSTATE_XSAVEOPT;
  446                                 regs[1] = 0;
  447                                 regs[2] = 0;
  448                                 regs[3] = 0;
  449                                 break;
  450                         default:
  451                                 /*
  452                                  * If the leaf is for a permitted feature,
  453                                  * pass through as-is, otherwise return
  454                                  * all zeroes.
  455                                  */
  456                                 if (!(limits->xcr0_allowed & (1ul << *ecx))) {
  457                                         regs[0] = 0;
  458                                         regs[1] = 0;
  459                                         regs[2] = 0;
  460                                         regs[3] = 0;
  461                                 }
  462                                 break;
  463                         }
  464                         break;
  465 
  466                 case 0x40000000:
  467                         regs[0] = CPUID_VM_HIGH;
  468                         bcopy(bhyve_id, &regs[1], 4);
  469                         bcopy(bhyve_id + 4, &regs[2], 4);
  470                         bcopy(bhyve_id + 8, &regs[3], 4);
  471                         break;
  472 
  473                 default:
  474                         /*
  475                          * The leaf value has already been clamped so
  476                          * simply pass this through, keeping count of
  477                          * how many unhandled leaf values have been seen.
  478                          */
  479                         atomic_add_long(&bhyve_xcpuids, 1);
  480                         cpuid_count(*eax, *ecx, regs);
  481                         break;
  482         }
  483 
  484         *eax = regs[0];
  485         *ebx = regs[1];
  486         *ecx = regs[2];
  487         *edx = regs[3];
  488 
  489         return (1);
  490 }
  491 
  492 bool
  493 vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap)
  494 {
  495         bool rv;
  496 
  497         KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d",
  498             __func__, cap));
  499 
  500         /*
  501          * Simply passthrough the capabilities of the host cpu for now.
  502          */
  503         rv = false;
  504         switch (cap) {
  505         case VCC_NO_EXECUTE:
  506                 if (amd_feature & AMDID_NX)
  507                         rv = true;
  508                 break;
  509         case VCC_FFXSR:
  510                 if (amd_feature & AMDID_FFXSR)
  511                         rv = true;
  512                 break;
  513         case VCC_TCE:
  514                 if (amd_feature2 & AMDID2_TCE)
  515                         rv = true;
  516                 break;
  517         default:
  518                 panic("%s: unknown vm_cpu_capability %d", __func__, cap);
  519         }
  520         return (rv);
  521 }
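
For readers tracing what a guest actually observes: the 0x40000000 case above returns CPUID_VM_HIGH in %eax and copies the 12-byte bhyve_id signature into %ebx, %ecx and %edx. The following is a minimal guest-side sketch (not part of x86.c; it assumes GCC/Clang inline assembly on an x86 guest) that reads that signature back:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;
	char id[13];

	/* Execute CPUID with %eax = 0x40000000 (hypervisor vendor leaf). */
	__asm__ __volatile__("cpuid"
	    : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
	    : "a" (0x40000000U), "c" (0U));

	/* The signature is returned four bytes at a time in %ebx, %ecx, %edx. */
	memcpy(id, &ebx, 4);
	memcpy(id + 4, &ecx, 4);
	memcpy(id + 8, &edx, 4);
	id[12] = '\0';

	/* Under bhyve this should print "bhyve bhyve " and 0x40000000. */
	printf("hypervisor: \"%s\" max leaf: %#x\n", id, eax);
	return (0);
}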
