The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or http://www.opensolaris.org/os/licensing.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  *
   21  * $FreeBSD$
   22  */
   23 
   24 /*
   25  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
   26  * Copyright (c) 2016, Joyent, Inc. All rights reserved.
   27  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
   28  */
   29 
   30 /*
   31  * DTrace - Dynamic Tracing for Solaris
   32  *
   33  * This is the implementation of the Solaris Dynamic Tracing framework
   34  * (DTrace).  The user-visible interface to DTrace is described at length in
   35  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
   36  * library, the in-kernel DTrace framework, and the DTrace providers are
   37  * described in the block comments in the <sys/dtrace.h> header file.  The
   38  * internal architecture of DTrace is described in the block comments in the
   39  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
   40  * implementation very much assume mastery of all of these sources; if one has
   41  * an unanswered question about the implementation, one should consult them
   42  * first.
   43  *
   44  * The functions here are ordered roughly as follows:
   45  *
   46  *   - Probe context functions
   47  *   - Probe hashing functions
   48  *   - Non-probe context utility functions
   49  *   - Matching functions
   50  *   - Provider-to-Framework API functions
   51  *   - Probe management functions
   52  *   - DIF object functions
   53  *   - Format functions
   54  *   - Predicate functions
   55  *   - ECB functions
   56  *   - Buffer functions
   57  *   - Enabling functions
   58  *   - DOF functions
   59  *   - Anonymous enabling functions
   60  *   - Consumer state functions
   61  *   - Helper functions
   62  *   - Hook functions
   63  *   - Driver cookbook functions
   64  *
   65  * Each group of functions begins with a block comment labelled the "DTrace
   66  * [Group] Functions", allowing one to find each block by searching forward
   67  * on capital-f functions.
   68  */
   69 #include <sys/errno.h>
   70 #include <sys/param.h>
   71 #include <sys/types.h>
   72 #ifndef illumos
   73 #include <sys/time.h>
   74 #endif
   75 #include <sys/stat.h>
   76 #include <sys/conf.h>
   77 #include <sys/systm.h>
   78 #include <sys/endian.h>
   79 #ifdef illumos
   80 #include <sys/ddi.h>
   81 #include <sys/sunddi.h>
   82 #endif
   83 #include <sys/cpuvar.h>
   84 #include <sys/kmem.h>
   85 #ifdef illumos
   86 #include <sys/strsubr.h>
   87 #endif
   88 #include <sys/sysmacros.h>
   89 #include <sys/dtrace_impl.h>
   90 #include <sys/atomic.h>
   91 #include <sys/cmn_err.h>
   92 #ifdef illumos
   93 #include <sys/mutex_impl.h>
   94 #include <sys/rwlock_impl.h>
   95 #endif
   96 #include <sys/ctf_api.h>
   97 #ifdef illumos
   98 #include <sys/panic.h>
   99 #include <sys/priv_impl.h>
  100 #endif
  101 #ifdef illumos
  102 #include <sys/cred_impl.h>
  103 #include <sys/procfs_isa.h>
  104 #endif
  105 #include <sys/taskq.h>
  106 #ifdef illumos
  107 #include <sys/mkdev.h>
  108 #include <sys/kdi.h>
  109 #endif
  110 #include <sys/zone.h>
  111 #include <sys/socket.h>
  112 #include <netinet/in.h>
  113 #include "strtolctype.h"
  114 
  115 /* FreeBSD includes: */
  116 #ifndef illumos
  117 #include <sys/callout.h>
  118 #include <sys/ctype.h>
  119 #include <sys/eventhandler.h>
  120 #include <sys/limits.h>
  121 #include <sys/linker.h>
  122 #include <sys/kdb.h>
  123 #include <sys/jail.h>
  124 #include <sys/kernel.h>
  125 #include <sys/malloc.h>
  126 #include <sys/lock.h>
  127 #include <sys/mutex.h>
  128 #include <sys/ptrace.h>
  129 #include <sys/random.h>
  130 #include <sys/rwlock.h>
  131 #include <sys/sx.h>
  132 #include <sys/sysctl.h>
  133 
  134 
  135 #include <sys/mount.h>
  136 #undef AT_UID
  137 #undef AT_GID
  138 #include <sys/vnode.h>
  139 #include <sys/cred.h>
  140 
  141 #include <sys/dtrace_bsd.h>
  142 
  143 #include <netinet/in.h>
  144 
  145 #include "dtrace_cddl.h"
  146 #include "dtrace_debug.c"
  147 #endif
  148 
  149 #include "dtrace_xoroshiro128_plus.h"
  150 
  151 /*
  152  * DTrace Tunable Variables
  153  *
  154  * The following variables may be tuned by adding a line to /etc/system that
  155  * includes both the name of the DTrace module ("dtrace") and the name of the
  156  * variable.  For example:
  157  *
  158  *   set dtrace:dtrace_destructive_disallow = 1
  159  *
  160  * In general, the only variables that one should be tuning this way are those
  161  * that affect system-wide DTrace behavior, and for which the default behavior
  162  * is undesirable.  Most of these variables are tunable on a per-consumer
  163  * basis using DTrace options, and need not be tuned on a system-wide basis.
  164  * When tuning these variables, avoid pathological values; while some attempt
  165  * is made to verify the integrity of these variables, they are not considered
  166  * part of the supported interface to DTrace, and they are therefore not
  167  * checked comprehensively.  Further, these variables should not be tuned
  168  * dynamically via "mdb -kw" or other means; they should only be tuned via
  169  * /etc/system.
  170  */
  171 int             dtrace_destructive_disallow = 0;
  172 #ifndef illumos
  173 /* Positive logic version of dtrace_destructive_disallow for loader tunable */
  174 int             dtrace_allow_destructive = 1;
  175 #endif
  176 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
  177 size_t          dtrace_difo_maxsize = (256 * 1024);
  178 dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024);
  179 size_t          dtrace_statvar_maxsize = (16 * 1024);
  180 size_t          dtrace_actions_max = (16 * 1024);
  181 size_t          dtrace_retain_max = 1024;
  182 dtrace_optval_t dtrace_helper_actions_max = 128;
  183 dtrace_optval_t dtrace_helper_providers_max = 32;
  184 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
  185 size_t          dtrace_strsize_default = 256;
  186 dtrace_optval_t dtrace_cleanrate_default = 9900990;             /* 101 hz */
  187 dtrace_optval_t dtrace_cleanrate_min = 200000;                  /* 5000 hz */
  188 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;  /* 1/minute */
  189 dtrace_optval_t dtrace_aggrate_default = NANOSEC;               /* 1 hz */
  190 dtrace_optval_t dtrace_statusrate_default = NANOSEC;            /* 1 hz */
  191 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;  /* 6/minute */
  192 dtrace_optval_t dtrace_switchrate_default = NANOSEC;            /* 1 hz */
  193 dtrace_optval_t dtrace_nspec_default = 1;
  194 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
  195 dtrace_optval_t dtrace_stackframes_default = 20;
  196 dtrace_optval_t dtrace_ustackframes_default = 20;
  197 dtrace_optval_t dtrace_jstackframes_default = 50;
  198 dtrace_optval_t dtrace_jstackstrsize_default = 512;
  199 int             dtrace_msgdsize_max = 128;
  200 hrtime_t        dtrace_chill_max = MSEC2NSEC(500);              /* 500 ms */
  201 hrtime_t        dtrace_chill_interval = NANOSEC;                /* 1000 ms */
  202 int             dtrace_devdepth_max = 32;
  203 int             dtrace_err_verbose;
  204 hrtime_t        dtrace_deadman_interval = NANOSEC;
  205 hrtime_t        dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
  206 hrtime_t        dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
  207 hrtime_t        dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
  208 #ifndef illumos
  209 int             dtrace_memstr_max = 4096;
  210 int             dtrace_bufsize_max_frac = 128;
  211 #endif
  212 
  213 /*
  214  * DTrace External Variables
  215  *
  216  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
  217  * available to DTrace consumers via the backtick (`) syntax.  One of these,
  218  * dtrace_zero, is made deliberately so:  it is provided as a source of
  219  * well-known, zero-filled memory.  While this variable is not documented,
  220  * it is used by some translators as an implementation detail.
  221  */
  222 const char      dtrace_zero[256] = { 0 };       /* zero-filled memory */
  223 
  224 /*
  225  * DTrace Internal Variables
  226  */
  227 #ifdef illumos
  228 static dev_info_t       *dtrace_devi;           /* device info */
  229 #endif
  230 #ifdef illumos
  231 static vmem_t           *dtrace_arena;          /* probe ID arena */
  232 static vmem_t           *dtrace_minor;          /* minor number arena */
  233 #else
  234 static taskq_t          *dtrace_taskq;          /* task queue */
  235 static struct unrhdr    *dtrace_arena;          /* Probe ID number.     */
  236 #endif
  237 static dtrace_probe_t   **dtrace_probes;        /* array of all probes */
  238 static int              dtrace_nprobes;         /* number of probes */
  239 static dtrace_provider_t *dtrace_provider;      /* provider list */
  240 static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
  241 static int              dtrace_opens;           /* number of opens */
  242 static int              dtrace_helpers;         /* number of helpers */
  243 static int              dtrace_getf;            /* number of unpriv getf()s */
  244 #ifdef illumos
  245 static void             *dtrace_softstate;      /* softstate pointer */
  246 #endif
  247 static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
  248 static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
  249 static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
  250 static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */
  251 static int              dtrace_toxranges;       /* number of toxic ranges */
  252 static int              dtrace_toxranges_max;   /* size of toxic range array */
  253 static dtrace_anon_t    dtrace_anon;            /* anonymous enabling */
  254 static kmem_cache_t     *dtrace_state_cache;    /* cache for dynamic state */
  255 static uint64_t         dtrace_vtime_references; /* number of vtimestamp refs */
  256 static kthread_t        *dtrace_panicked;       /* panicking thread */
  257 static dtrace_ecb_t     *dtrace_ecb_create_cache; /* cached created ECB */
  258 static dtrace_genid_t   dtrace_probegen;        /* current probe generation */
  259 static dtrace_helpers_t *dtrace_deferred_pid;   /* deferred helper list */
  260 static dtrace_enabling_t *dtrace_retained;      /* list of retained enablings */
  261 static dtrace_genid_t   dtrace_retained_gen;    /* current retained enab gen */
  262 static dtrace_dynvar_t  dtrace_dynhash_sink;    /* end of dynamic hash chains */
  263 static int              dtrace_dynvar_failclean; /* dynvars failed to clean */
  264 #ifndef illumos
  265 static struct mtx       dtrace_unr_mtx;
  266 MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
  267 static eventhandler_tag dtrace_kld_load_tag;
  268 static eventhandler_tag dtrace_kld_unload_try_tag;
  269 #endif
  270 
  271 /*
  272  * DTrace Locking
  273  * DTrace is protected by three (relatively coarse-grained) locks:
  274  *
  275  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
  276  *     including enabling state, probes, ECBs, consumer state, helper state,
  277  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
  278  *     probe context is lock-free -- synchronization is handled via the
  279  *     dtrace_sync() cross call mechanism.
  280  *
  281  * (2) dtrace_provider_lock is required when manipulating provider state, or
  282  *     when provider state must be held constant.
  283  *
  284  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
  285  *     when meta provider state must be held constant.
  286  *
  287  * The lock ordering between these three locks is dtrace_meta_lock before
  288  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
  289  * several places where dtrace_provider_lock is held by the framework as it
  290  * calls into the providers -- which then call back into the framework,
  291  * grabbing dtrace_lock.)
  292  *
  293  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
  294  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
  295  * role as a coarse-grained lock; it is acquired before both of these locks.
  296  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
  297  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
  298  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
  299  * acquired _between_ dtrace_provider_lock and dtrace_lock.
  300  */
  301 static kmutex_t         dtrace_lock;            /* probe state lock */
  302 static kmutex_t         dtrace_provider_lock;   /* provider state lock */
  303 static kmutex_t         dtrace_meta_lock;       /* meta-provider state lock */
  304 
  305 #ifndef illumos
  306 /* XXX FreeBSD hacks. */
  307 #define cr_suid         cr_svuid
  308 #define cr_sgid         cr_svgid
  309 #define ipaddr_t        in_addr_t
  310 #define mod_modname     pathname
  311 #define vuprintf        vprintf
  312 #ifndef crgetzoneid
  313 #define crgetzoneid(_a)        0
  314 #endif
  315 #define ttoproc(_a)     ((_a)->td_proc)
  316 #define SNOCD           0
  317 #define CPU_ON_INTR(_a) 0
  318 
  319 #define PRIV_EFFECTIVE          (1 << 0)
  320 #define PRIV_DTRACE_KERNEL      (1 << 1)
  321 #define PRIV_DTRACE_PROC        (1 << 2)
  322 #define PRIV_DTRACE_USER        (1 << 3)
  323 #define PRIV_PROC_OWNER         (1 << 4)
  324 #define PRIV_PROC_ZONE          (1 << 5)
  325 #define PRIV_ALL                ~0
  326 
  327 SYSCTL_DECL(_debug_dtrace);
  328 SYSCTL_DECL(_kern_dtrace);
  329 #endif
  330 
  331 #ifdef illumos
  332 #define curcpu  CPU->cpu_id
  333 #endif
  334 
  335 
  336 /*
  337  * DTrace Provider Variables
  338  *
  339  * These are the variables relating to DTrace as a provider (that is, the
  340  * provider of the BEGIN, END, and ERROR probes).
  341  */
  342 static dtrace_pattr_t   dtrace_provider_attr = {
  343 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
  344 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
  345 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
  346 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
  347 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
  348 };
  349 
  350 static void
  351 dtrace_nullop(void)
  352 {}      /* deliberately empty: do-nothing callback for ops with no work */
  353 
  354 static dtrace_pops_t dtrace_provider_ops = {
              /*
               * Operations vector used when DTrace acts as a provider itself.
               * Each callback slot that has nothing to do is filled with
               * dtrace_nullop, cast to that slot's signature; the getargdesc,
               * getargval and usermode slots are left NULL.
               *
               * NOTE(review): dtrace_nullop is declared void(void).  Calling
               * it through these differently-typed function pointers is
               * undefined behaviour in ISO C, even where the calling
               * convention makes it harmless, and may trip
               * control-flow-integrity checking.  Typed no-op stubs would
               * remove the casts.
               */
  355         .dtps_provide = (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
  356         .dtps_provide_module =  (void (*)(void *, modctl_t *))dtrace_nullop,
  357         .dtps_enable =  (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
  358         .dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
  359         .dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
  360         .dtps_resume =  (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
  361         .dtps_getargdesc =      NULL,
  362         .dtps_getargval =       NULL,
  363         .dtps_usermode =        NULL,
  364         .dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
  365 };
  366 
  367 static dtrace_id_t      dtrace_probeid_begin;   /* special BEGIN probe */
  368 static dtrace_id_t      dtrace_probeid_end;     /* special END probe */
  369 dtrace_id_t             dtrace_probeid_error;   /* special ERROR probe */
  370 
  371 /*
  372  * DTrace Helper Tracing Variables
  373  *
  374  * These variables should be set dynamically to enable helper tracing.  The
  375  * only variables that should be set are dtrace_helptrace_enable (which should
  376  * be set to a non-zero value to allocate helper tracing buffers on the next
  377  * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
  378  * non-zero value to deallocate helper tracing buffers on the next close of
  379  * /dev/dtrace).  When (and only when) helper tracing is disabled, the
  380  * buffer size may also be set via dtrace_helptrace_bufsize.
  381  */
  382 int                     dtrace_helptrace_enable = 0;
  383 int                     dtrace_helptrace_disable = 0;
  384 int                     dtrace_helptrace_bufsize = 16 * 1024 * 1024;
  385 uint32_t                dtrace_helptrace_nlocals;
  386 static dtrace_helptrace_t *dtrace_helptrace_buffer;
  387 static uint32_t         dtrace_helptrace_next = 0;
  388 static int              dtrace_helptrace_wrapped = 0;
  389 
  390 /*
  391  * DTrace Error Hashing
  392  *
  393  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
  394  * table.  This is very useful for checking coverage of tests that are
  395  * expected to induce DIF or DOF processing errors, and may be useful for
  396  * debugging problems in the DIF code generator or in DOF generation.  The
  397  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
  398  */
  399 #ifdef DEBUG
  400 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
  401 static const char *dtrace_errlast;
  402 static kthread_t *dtrace_errthread;
  403 static kmutex_t dtrace_errlock;
  404 #endif
  405 
  406 /*
  407  * DTrace Macros and Constants
  408  *
  409  * These are various macros that are useful in various spots in the
  410  * implementation, along with a few random constants that have no meaning
  411  * outside of the implementation.  There is no real structure to this cpp
  412  * mishmash -- but is there ever?
  413  */
  414 #define DTRACE_HASHSTR(hash, probe)     \
  415         dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
  416 
  417 #define DTRACE_HASHNEXT(hash, probe)    \
  418         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
  419 
  420 #define DTRACE_HASHPREV(hash, probe)    \
  421         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
  422 
  423 #define DTRACE_HASHEQ(hash, lhs, rhs)   \
  424         (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
  425             *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
  426 
  427 #define DTRACE_AGGHASHSIZE_SLEW         17
  428 
  429 #define DTRACE_V4MAPPED_OFFSET          (sizeof (uint32_t) * 3)
  430 
  431 /*
  432  * The key for a thread-local variable consists of the lower 61 bits of the
  433  * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
  434  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
  435  * equal to a variable identifier.  This is necessary (but not sufficient) to
  436  * assure that global associative arrays never collide with thread-local
  437  * variables.  To guarantee that they cannot collide, we must also define the
  438  * order for keying dynamic variables.  That order is:
  439  *
  440  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
  441  *
  442  * Because the variable-key and the tls-key are in orthogonal spaces, there is
  443  * no way for a global variable key signature to match a thread-local key
  444  * signature.
  445  */
  446 #ifdef illumos
  447 #define DTRACE_TLS_THRKEY(where) { \
  448         uint_t intr = 0; \
  449         uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
  450         for (; actv; actv >>= 1) \
  451                 intr++; \
  452         ASSERT(intr < (1 << 3)); \
  453         (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
  454             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
  455 }
  456 #else
  457 #define DTRACE_TLS_THRKEY(where) { \
  458         solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
  459         uint_t intr = 0; \
  460         uint_t actv = _c->cpu_intr_actv; \
  461         for (; actv; actv >>= 1) \
  462                 intr++; \
  463         ASSERT(intr < (1 << 3)); \
  464         (where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
  465             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
  466 }
  467 #endif
  468 
  469 #define DT_BSWAP_8(x)   ((x) & 0xff)
  470 #define DT_BSWAP_16(x)  ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
  471 #define DT_BSWAP_32(x)  ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
  472 #define DT_BSWAP_64(x)  ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
  473 
  474 #define DT_MASK_LO 0x00000000FFFFFFFFULL
  475 
  476 #define DTRACE_STORE(type, tomax, offset, what) \
  477         *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
      /*
       * DTRACE_STORE writes (type)(what) at byte offset `offset` from `tomax`.
       * NOTE(review): the expansion already ends in ';' and `offset` is cast
       * without being parenthesized, so pass a simple expression for `offset`
       * and do not use the macro as the sole body of an if/else arm.
       */
  478 
  479 #if !defined(__x86) && !defined(__aarch64__)
      /*
       * When neither __x86 nor __aarch64__ is defined, reject addresses that
       * have any of the low (size - 1) bits set: flag CPU_DTRACE_BADALIGN,
       * record the address in cpuc_dtrace_illval, and return 0 from the
       * *enclosing* function.  The mask test is an alignment test only when
       * size is a power of two.
       *
       * NOTE(review): the arguments are expanded unparenthesized and the
       * expansion is a bare `if`; pass plain identifiers and never follow
       * the macro with an `else`.
       */
  480 #define DTRACE_ALIGNCHECK(addr, size, flags)                            \
  481         if (addr & (size - 1)) {                                        \
  482                 *flags |= CPU_DTRACE_BADALIGN;                          \
  483                 cpu_core[curcpu].cpuc_dtrace_illval = addr;     \
  484                 return (0);                                             \
  485         }
  486 #else
      /* Otherwise the check expands to nothing. */
  487 #define DTRACE_ALIGNCHECK(addr, size, flags)
  488 #endif
  489 
  490 /*
  491  * Test whether a range of memory starting at testaddr of size testsz falls
  492  * within the range of memory described by addr, sz.  We take care to avoid
  493  * problems with overflow and underflow of the unsigned quantities, and
  494  * disallow all negative sizes.  Ranges of size 0 are allowed.
  495  */
  496 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
  497         ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
  498         (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
  499         (testaddr) + (testsz) >= (testaddr))
  500 
  501 #define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)               \
  502 do {                                                                    \
  503         if ((remp) != NULL) {                                           \
  504                 *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);    \
  505         }                                                               \
  506 } while (0)
  507 
  508 
  509 /*
  510  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
  511  * alloc_sz on the righthand side of the comparison in order to avoid overflow
  512  * or underflow in the comparison with it.  This is simpler than the INRANGE
  513  * check above, because we know that the dtms_scratch_ptr is valid in the
  514  * range.  Allocations of size zero are allowed.
  515  */
  516 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
  517         ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
  518         (mstate)->dtms_scratch_ptr >= (alloc_sz))
  519 
  520 #define DTRACE_LOADFUNC(bits)                                           \
      /*                                                                      \
       * Expands to the definition of dtrace_load<bits>(), which loads a      \
       * <bits>-wide value from addr.  The generated function applies         \
       * DTRACE_ALIGNCHECK, refuses any access that overlaps an entry of      \
       * dtrace_toxrange[] (CPU_DTRACE_BADADDR, address recorded in           \
       * cpuc_dtrace_illval), and performs the load with                      \
       * CPU_DTRACE_NOFAULT set.  It yields 0 on every failure path, so a     \
       * zero result is ambiguous unless the per-CPU flags are also checked.  \
       * NOTE(review): `addr + size` in the range test is not guarded         \
       * against wrap-around -- confirm that is acceptable.                   \
       */                                                                     \
  521 /*CSTYLED*/                                                             \
  522 uint##bits##_t                                                          \
  523 dtrace_load##bits(uintptr_t addr)                                       \
  524 {                                                                       \
  525         size_t size = bits / NBBY;                                      \
  526         /*CSTYLED*/                                                     \
  527         uint##bits##_t rval;                                            \
  528         int i;                                                          \
  529         volatile uint16_t *flags = (volatile uint16_t *)                \
  530             &cpu_core[curcpu].cpuc_dtrace_flags;                        \
  531                                                                         \
  532         DTRACE_ALIGNCHECK(addr, size, flags);                           \
  533                                                                         \
  534         for (i = 0; i < dtrace_toxranges; i++) {                        \
  535                 if (addr >= dtrace_toxrange[i].dtt_limit)               \
  536                         continue;                                       \
  537                                                                         \
  538                 if (addr + size <= dtrace_toxrange[i].dtt_base)         \
  539                         continue;                                       \
  540                                                                         \
  541                 /*                                                      \
  542                  * This address falls within a toxic region; return 0.  \
  543                  */                                                     \
  544                 *flags |= CPU_DTRACE_BADADDR;                           \
  545                 cpu_core[curcpu].cpuc_dtrace_illval = addr;             \
  546                 return (0);                                             \
  547         }                                                               \
  548                                                                         \
  549         *flags |= CPU_DTRACE_NOFAULT;                                   \
  550         /*CSTYLED*/                                                     \
  551         rval = *((volatile uint##bits##_t *)addr);                      \
  552         *flags &= ~CPU_DTRACE_NOFAULT;                                  \
  553                                                                         \
  554         return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);               \
  555 }
  556 
  557 #ifdef _LP64
  558 #define dtrace_loadptr  dtrace_load64
  559 #else
  560 #define dtrace_loadptr  dtrace_load32
  561 #endif
  562 
  563 #define DTRACE_DYNHASH_FREE     0
  564 #define DTRACE_DYNHASH_SINK     1
  565 #define DTRACE_DYNHASH_VALID    2
  566 
  567 #define DTRACE_MATCH_NEXT       0
  568 #define DTRACE_MATCH_DONE       1
  569 #define DTRACE_ANCHORED(probe)  ((probe)->dtpr_func[0] != '\0')
  570 #define DTRACE_STATE_ALIGN      64
  571 
  572 #define DTRACE_FLAGS2FLT(flags)                                         \
  573         (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :           \
  574         ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :                \
  575         ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :            \
  576         ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :                \
  577         ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :                \
  578         ((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :         \
  579         ((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :         \
  580         ((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :       \
  581         ((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :         \
  582         DTRACEFLT_UNKNOWN)
  583 
  584 #define DTRACEACT_ISSTRING(act)                                         \
  585         ((act)->dta_kind == DTRACEACT_DIFEXPR &&                        \
  586         (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
  587 
  588 /* Function prototype definitions: */
  589 static size_t dtrace_strlen(const char *, size_t);
  590 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
  591 static void dtrace_enabling_provide(dtrace_provider_t *);
  592 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
  593 static void dtrace_enabling_matchall(void);
  594 static void dtrace_enabling_reap(void);
  595 static dtrace_state_t *dtrace_anon_grab(void);
  596 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
  597     dtrace_state_t *, uint64_t, uint64_t);
  598 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
  599 static void dtrace_buffer_drop(dtrace_buffer_t *);
  600 static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
  601 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
  602     dtrace_state_t *, dtrace_mstate_t *);
  603 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
  604     dtrace_optval_t);
  605 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
  606 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
  607 uint16_t dtrace_load16(uintptr_t);
  608 uint32_t dtrace_load32(uintptr_t);
  609 uint64_t dtrace_load64(uintptr_t);
  610 uint8_t dtrace_load8(uintptr_t);
  611 void dtrace_dynvar_clean(dtrace_dstate_t *);
  612 dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
  613     size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
  614 uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
  615 static int dtrace_priv_proc(dtrace_state_t *);
  616 static void dtrace_getf_barrier(void);
  617 static int dtrace_canload_remains(uint64_t, size_t, size_t *,
  618     dtrace_mstate_t *, dtrace_vstate_t *);
  619 static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
  620     dtrace_mstate_t *, dtrace_vstate_t *);
  621 
  622 /*
  623  * DTrace Probe Context Functions
  624  *
  625  * These functions are called from probe context.  Because probe context is
  626  * any context in which C may be called, arbitrary locks may be held,
  627  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
  628  * As a result, functions called from probe context may only call other DTrace
  629  * support functions -- they may not interact at all with the system at large.
  630  * (Note that the ASSERT macro is made probe-context safe by redefining it in
  631  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
  632  * loads are to be performed from probe context, they _must_ be in terms of
  633  * the safe dtrace_load*() variants.
  634  *
  635  * Some functions in this block are not actually called from probe context;
  636  * for these functions, there will be a comment above the function reading
  637  * "Note:  not called from probe context."
  638  */
  639 void
  640 dtrace_panic(const char *format, ...)
  641 {
  642         va_list alist;
  643 
  644         va_start(alist, format);
  645 #ifdef __FreeBSD__
  646         vpanic(format, alist);
  647 #else
  648         dtrace_vpanic(format, alist);
  649 #endif
  650         va_end(alist);
  651 }
  652 
  653 int
  654 dtrace_assfail(const char *a, const char *f, int l)
  655 {
  656         dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
  657 
  658         /*
  659          * We just need something here that even the most clever compiler
  660          * cannot optimize away.
  661          */
  662         return (a[(uintptr_t)f]);
  663 }
  664 
  665 /*
  666  * Atomically increment a specified error counter from probe context.
  667  */
  668 static void
  669 dtrace_error(uint32_t *counter)
  670 {
  671         /*
  672          * Most counters stored to in probe context are per-CPU counters.
  673          * However, there are some error conditions that are sufficiently
  674          * arcane that they don't merit per-CPU storage.  If these counters
  675          * are incremented concurrently on different CPUs, scalability will be
  676          * adversely affected -- but we don't expect them to be white-hot in a
  677          * correctly constructed enabling...
  678          */
  679         uint32_t oval, nval;
  680 
  681         do {
  682                 oval = *counter;
  683 
  684                 if ((nval = oval + 1) == 0) {
  685                         /*
  686                          * If the counter would wrap, set it to 1 -- assuring
  687                          * that the counter is never zero when we have seen
  688                          * errors.  (The counter must be 32-bits because we
  689                          * aren't guaranteed a 64-bit compare&swap operation.)
  690                          * To save this code both the infamy of being fingered
  691                          * by a priggish news story and the indignity of being
  692                          * the target of a neo-puritan witch trial, we're
  693                          * carefully avoiding any colorful description of the
  694                          * likelihood of this condition -- but suffice it to
  695                          * say that it is only slightly more likely than the
  696                          * overflow of predicate cache IDs, as discussed in
  697                          * dtrace_predicate_create().
  698                          */
  699                         nval = 1;
  700                 }
  701         } while (dtrace_cas32(counter, oval, nval) != oval);
  702 }
  703 
/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.  The macro body is defined
 * elsewhere; presumably each expansion yields the matching dtrace_load8(),
 * dtrace_load16(), dtrace_load32() or dtrace_load64() routine that is
 * prototyped earlier in this file and used for loads from probe context.
 * The invocations are bracketed by CSTYLED markers because they are not
 * followed by semicolons.
 */
/* BEGIN CSTYLED */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
/* END CSTYLED */
  714 
  715 static int
  716 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
  717 {
  718         if (dest < mstate->dtms_scratch_base)
  719                 return (0);
  720 
  721         if (dest + size < dest)
  722                 return (0);
  723 
  724         if (dest + size > mstate->dtms_scratch_ptr)
  725                 return (0);
  726 
  727         return (1);
  728 }
  729 
  730 static int
  731 dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
  732     dtrace_statvar_t **svars, int nsvars)
  733 {
  734         int i;
  735         size_t maxglobalsize, maxlocalsize;
  736 
  737         if (nsvars == 0)
  738                 return (0);
  739 
  740         maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
  741         maxlocalsize = maxglobalsize * NCPU;
  742 
  743         for (i = 0; i < nsvars; i++) {
  744                 dtrace_statvar_t *svar = svars[i];
  745                 uint8_t scope;
  746                 size_t size;
  747 
  748                 if (svar == NULL || (size = svar->dtsv_size) == 0)
  749                         continue;
  750 
  751                 scope = svar->dtsv_var.dtdv_scope;
  752 
  753                 /*
  754                  * We verify that our size is valid in the spirit of providing
  755                  * defense in depth:  we want to prevent attackers from using
  756                  * DTrace to escalate an orthogonal kernel heap corruption bug
  757                  * into the ability to store to arbitrary locations in memory.
  758                  */
  759                 VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
  760                     (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
  761 
  762                 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,
  763                     svar->dtsv_size)) {
  764                         DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
  765                             svar->dtsv_size);
  766                         return (1);
  767                 }
  768         }
  769 
  770         return (0);
  771 }
  772 
  773 /*
  774  * Check to see if the address is within a memory region to which a store may
  775  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
  776  * region.  The caller of dtrace_canstore() is responsible for performing any
  777  * alignment checks that are needed before stores are actually executed.
  778  */
  779 static int
  780 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
  781     dtrace_vstate_t *vstate)
  782 {
  783         return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
  784 }
  785 
  786 /*
  787  * Implementation of dtrace_canstore which communicates the upper bound of the
  788  * allowed memory region.
  789  */
  790 static int
  791 dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
  792     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
  793 {
  794         /*
  795          * First, check to see if the address is in scratch space...
  796          */
  797         if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
  798             mstate->dtms_scratch_size)) {
  799                 DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
  800                     mstate->dtms_scratch_size);
  801                 return (1);
  802         }
  803 
  804         /*
  805          * Now check to see if it's a dynamic variable.  This check will pick
  806          * up both thread-local variables and any global dynamically-allocated
  807          * variables.
  808          */
  809         if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
  810             vstate->dtvs_dynvars.dtds_size)) {
  811                 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
  812                 uintptr_t base = (uintptr_t)dstate->dtds_base +
  813                     (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
  814                 uintptr_t chunkoffs;
  815                 dtrace_dynvar_t *dvar;
  816 
  817                 /*
  818                  * Before we assume that we can store here, we need to make
  819                  * sure that it isn't in our metadata -- storing to our
  820                  * dynamic variable metadata would corrupt our state.  For
  821                  * the range to not include any dynamic variable metadata,
  822                  * it must:
  823                  *
  824                  *      (1) Start above the hash table that is at the base of
  825                  *      the dynamic variable space
  826                  *
  827                  *      (2) Have a starting chunk offset that is beyond the
  828                  *      dtrace_dynvar_t that is at the base of every chunk
  829                  *
  830                  *      (3) Not span a chunk boundary
  831                  *
  832                  *      (4) Not be in the tuple space of a dynamic variable
  833                  *
  834                  */
  835                 if (addr < base)
  836                         return (0);
  837 
  838                 chunkoffs = (addr - base) % dstate->dtds_chunksize;
  839 
  840                 if (chunkoffs < sizeof (dtrace_dynvar_t))
  841                         return (0);
  842 
  843                 if (chunkoffs + sz > dstate->dtds_chunksize)
  844                         return (0);
  845 
  846                 dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
  847 
  848                 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
  849                         return (0);
  850 
  851                 if (chunkoffs < sizeof (dtrace_dynvar_t) +
  852                     ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
  853                         return (0);
  854 
  855                 DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);
  856                 return (1);
  857         }
  858 
  859         /*
  860          * Finally, check the static local and global variables.  These checks
  861          * take the longest, so we perform them last.
  862          */
  863         if (dtrace_canstore_statvar(addr, sz, remain,
  864             vstate->dtvs_locals, vstate->dtvs_nlocals))
  865                 return (1);
  866 
  867         if (dtrace_canstore_statvar(addr, sz, remain,
  868             vstate->dtvs_globals, vstate->dtvs_nglobals))
  869                 return (1);
  870 
  871         return (0);
  872 }
  873 
  874 
  875 /*
  876  * Convenience routine to check to see if the address is within a memory
  877  * region in which a load may be issued given the user's privilege level;
  878  * if not, it sets the appropriate error flags and loads 'addr' into the
  879  * illegal value slot.
  880  *
  881  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
  882  * appropriate memory access protection.
  883  */
  884 static int
  885 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
  886     dtrace_vstate_t *vstate)
  887 {
  888         return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
  889 }
  890 
/*
 * Implementation of dtrace_canload which communicates the upper bound of the
 * allowed memory region.
 *
 * Returns 1 if the load of 'sz' bytes at 'addr' is permitted and 0 if it is
 * not.  On every success path the matching region is handed to
 * DTRACE_RANGE_REMAIN() together with 'remain' (dtrace_canload() passes NULL
 * for it).  On failure, CPU_DTRACE_KPRIV is set and 'addr' is stored in the
 * current CPU's cpuc_dtrace_illval.
 */
static int
dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
        volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
        file_t *fp;

        /*
         * If we hold the privilege to read from kernel memory, then
         * everything is readable.
         */
        if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
                DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
                return (1);
        }

        /*
         * You can obviously read that which you can store.
         */
        if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
                return (1);

        /*
         * We're allowed to read from our own string table.
         */
        if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
            mstate->dtms_difo->dtdo_strlen)) {
                DTRACE_RANGE_REMAIN(remain, addr,
                    mstate->dtms_difo->dtdo_strtab,
                    mstate->dtms_difo->dtdo_strlen);
                return (1);
        }

        if (vstate->dtvs_state != NULL &&
            dtrace_priv_proc(vstate->dtvs_state)) {
                proc_t *p;

                /*
                 * When we have privileges to the current process, there are
                 * several context-related kernel structures that are safe to
                 * read, even absent the privilege to read from kernel memory.
                 * These reads are safe because these structures contain only
                 * state that (1) we're permitted to read, (2) is harmless or
                 * (3) contains pointers to additional kernel state that we're
                 * not permitted to read (and as such, do not present an
                 * opportunity for privilege escalation).  Finally (and
                 * critically), because of the nature of their relation with
                 * the current thread context, the memory associated with these
                 * structures cannot change over the duration of probe context,
                 * and it is therefore impossible for this memory to be
                 * deallocated and reallocated as something else while it's
                 * being operated upon.
                 */

                /* The current thread structure itself. */
                if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
                        DTRACE_RANGE_REMAIN(remain, addr, curthread,
                            sizeof (kthread_t));
                        return (1);
                }

                /* The current thread's process structure, if it has one. */
                if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
                    sz, curthread->t_procp, sizeof (proc_t))) {
                        DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
                            sizeof (proc_t));
                        return (1);
                }

                /* The current thread's credential, if it has one. */
                if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
                    curthread->t_cred, sizeof (cred_t))) {
                        DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
                            sizeof (cred_t));
                        return (1);
                }

#ifdef illumos
                /* The pid_id member reached through the process's p_pidp. */
                if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
                    &(p->p_pidp->pid_id), sizeof (pid_t))) {
                        DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
                            sizeof (pid_t));
                        return (1);
                }

                /*
                 * The leading part of the current thread's cpu_t, up to (but
                 * not including) its cpu_pause_thread member.
                 */
                if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
                    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
                        DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
                            offsetof(cpu_t, cpu_pause_thread));
                        return (1);
                }
#endif
        }

        if ((fp = mstate->dtms_getf) != NULL) {
                uintptr_t psz = sizeof (void *);
                vnode_t *vp;
                vnodeops_t *op;

                /*
                 * When getf() returns a file_t, the enabling is implicitly
                 * granted the (transient) right to read the returned file_t
                 * as well as the v_path and v_op->vnop_name of the underlying
                 * vnode.  These accesses are allowed after a successful
                 * getf() because the members that they refer to cannot change
                 * once set -- and the barrier logic in the kernel's closef()
                 * path assures that the file_t and its referenced vnode_t
                 * cannot themselves be stale (that is, it is impossible for
                 * either dtms_getf itself or its f_vnode member to reference
                 * freed memory).
                 */
                if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
                        DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
                        return (1);
                }

                if ((vp = fp->f_vnode) != NULL) {
                        size_t slen;
#ifdef illumos
                        /* The v_path pointer itself ... */
                        if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
                                DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
                                    psz);
                                return (1);
                        }
                        /* ... and the string it points to, including NUL. */
                        slen = strlen(vp->v_path) + 1;
                        if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
                                DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
                                    slen);
                                return (1);
                        }
#endif

                        /* The v_op pointer itself. */
                        if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
                                DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
                                    psz);
                                return (1);
                        }

#ifdef illumos
                        /* The vnop_name pointer within the vnode ops ... */
                        if ((op = vp->v_op) != NULL &&
                            DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
                                DTRACE_RANGE_REMAIN(remain, addr,
                                    &op->vnop_name, psz);
                                return (1);
                        }

                        /* ... and the string it points to, including NUL. */
                        if (op != NULL && op->vnop_name != NULL &&
                            DTRACE_INRANGE(addr, sz, op->vnop_name,
                            (slen = strlen(op->vnop_name) + 1))) {
                                DTRACE_RANGE_REMAIN(remain, addr,
                                    op->vnop_name, slen);
                                return (1);
                        }
#endif
                }
        }

        /*
         * Nothing matched:  flag the privilege violation and record the
         * offending address in the illegal value slot.
         */
        DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
        *illval = addr;
        return (0);
}
 1052 
 1053 /*
 1054  * Convenience routine to check to see if a given string is within a memory
 1055  * region in which a load may be issued given the user's privilege level;
 1056  * this exists so that we don't need to issue unnecessary dtrace_strlen()
 1057  * calls in the event that the user has all privileges.
 1058  */
 1059 static int
 1060 dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
 1061     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
 1062 {
 1063         size_t rsize;
 1064 
 1065         /*
 1066          * If we hold the privilege to read from kernel memory, then
 1067          * everything is readable.
 1068          */
 1069         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
 1070                 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
 1071                 return (1);
 1072         }
 1073 
 1074         /*
 1075          * Even if the caller is uninterested in querying the remaining valid
 1076          * range, it is required to ensure that the access is allowed.
 1077          */
 1078         if (remain == NULL) {
 1079                 remain = &rsize;
 1080         }
 1081         if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
 1082                 size_t strsz;
 1083                 /*
 1084                  * Perform the strlen after determining the length of the
 1085                  * memory region which is accessible.  This prevents timing
 1086                  * information from being used to find NULs in memory which is
 1087                  * not accessible to the caller.
 1088                  */
 1089                 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
 1090                     MIN(sz, *remain));
 1091                 if (strsz <= *remain) {
 1092                         return (1);
 1093                 }
 1094         }
 1095 
 1096         return (0);
 1097 }
 1098 
 1099 /*
 1100  * Convenience routine to check to see if a given variable is within a memory
 1101  * region in which a load may be issued given the user's privilege level.
 1102  */
 1103 static int
 1104 dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
 1105     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
 1106 {
 1107         size_t sz;
 1108         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 1109 
 1110         /*
 1111          * Calculate the max size before performing any checks since even
 1112          * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
 1113          * return the max length via 'remain'.
 1114          */
 1115         if (type->dtdt_kind == DIF_TYPE_STRING) {
 1116                 dtrace_state_t *state = vstate->dtvs_state;
 1117 
 1118                 if (state != NULL) {
 1119                         sz = state->dts_options[DTRACEOPT_STRSIZE];
 1120                 } else {
 1121                         /*
 1122                          * In helper context, we have a NULL state; fall back
 1123                          * to using the system-wide default for the string size
 1124                          * in this case.
 1125                          */
 1126                         sz = dtrace_strsize_default;
 1127                 }
 1128         } else {
 1129                 sz = type->dtdt_size;
 1130         }
 1131 
 1132         /*
 1133          * If we hold the privilege to read from kernel memory, then
 1134          * everything is readable.
 1135          */
 1136         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
 1137                 DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
 1138                 return (1);
 1139         }
 1140 
 1141         if (type->dtdt_kind == DIF_TYPE_STRING) {
 1142                 return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
 1143                     vstate));
 1144         }
 1145         return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
 1146             vstate));
 1147 }
 1148 
 1149 /*
 1150  * Convert a string to a signed integer using safe loads.
 1151  *
 1152  * NOTE: This function uses various macros from strtolctype.h to manipulate
 1153  * digit values, etc -- these have all been checked to ensure they make
 1154  * no additional function calls.
 1155  */
 1156 static int64_t
 1157 dtrace_strtoll(char *input, int base, size_t limit)
 1158 {
 1159         uintptr_t pos = (uintptr_t)input;
 1160         int64_t val = 0;
 1161         int x;
 1162         boolean_t neg = B_FALSE;
 1163         char c, cc, ccc;
 1164         uintptr_t end = pos + limit;
 1165 
 1166         /*
 1167          * Consume any whitespace preceding digits.
 1168          */
 1169         while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
 1170                 pos++;
 1171 
 1172         /*
 1173          * Handle an explicit sign if one is present.
 1174          */
 1175         if (c == '-' || c == '+') {
 1176                 if (c == '-')
 1177                         neg = B_TRUE;
 1178                 c = dtrace_load8(++pos);
 1179         }
 1180 
 1181         /*
 1182          * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
 1183          * if present.
 1184          */
 1185         if (base == 16 && c == '' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
 1186             cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
 1187                 pos += 2;
 1188                 c = ccc;
 1189         }
 1190 
 1191         /*
 1192          * Read in contiguous digits until the first non-digit character.
 1193          */
 1194         for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
 1195             c = dtrace_load8(++pos))
 1196                 val = val * base + x;
 1197 
 1198         return (neg ? -val : val);
 1199 }
 1200 
 1201 /*
 1202  * Compare two strings using safe loads.
 1203  */
 1204 static int
 1205 dtrace_strncmp(char *s1, char *s2, size_t limit)
 1206 {
 1207         uint8_t c1, c2;
 1208         volatile uint16_t *flags;
 1209 
 1210         if (s1 == s2 || limit == 0)
 1211                 return (0);
 1212 
 1213         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
 1214 
 1215         do {
 1216                 if (s1 == NULL) {
 1217                         c1 = '\0';
 1218                 } else {
 1219                         c1 = dtrace_load8((uintptr_t)s1++);
 1220                 }
 1221 
 1222                 if (s2 == NULL) {
 1223                         c2 = '\0';
 1224                 } else {
 1225                         c2 = dtrace_load8((uintptr_t)s2++);
 1226                 }
 1227 
 1228                 if (c1 != c2)
 1229                         return (c1 - c2);
 1230         } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
 1231 
 1232         return (0);
 1233 }
 1234 
 1235 /*
 1236  * Compute strlen(s) for a string using safe memory accesses.  The additional
 1237  * len parameter is used to specify a maximum length to ensure completion.
 1238  */
 1239 static size_t
 1240 dtrace_strlen(const char *s, size_t lim)
 1241 {
 1242         uint_t len;
 1243 
 1244         for (len = 0; len != lim; len++) {
 1245                 if (dtrace_load8((uintptr_t)s++) == '\0')
 1246                         break;
 1247         }
 1248 
 1249         return (len);
 1250 }
 1251 
 1252 /*
 1253  * Check if an address falls within a toxic region.
 1254  */
 1255 static int
 1256 dtrace_istoxic(uintptr_t kaddr, size_t size)
 1257 {
 1258         uintptr_t taddr, tsize;
 1259         int i;
 1260 
 1261         for (i = 0; i < dtrace_toxranges; i++) {
 1262                 taddr = dtrace_toxrange[i].dtt_base;
 1263                 tsize = dtrace_toxrange[i].dtt_limit - taddr;
 1264 
 1265                 if (kaddr - taddr < tsize) {
 1266                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
 1267                         cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
 1268                         return (1);
 1269                 }
 1270 
 1271                 if (taddr - kaddr < size) {
 1272                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
 1273                         cpu_core[curcpu].cpuc_dtrace_illval = taddr;
 1274                         return (1);
 1275                 }
 1276         }
 1277 
 1278         return (0);
 1279 }
 1280 
 1281 /*
 1282  * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 1283  * memory specified by the DIF program.  The dst is assumed to be safe memory
 1284  * that we can store to directly because it is managed by DTrace.  As with
 1285  * standard bcopy, overlapping copies are handled properly.
 1286  */
 1287 static void
 1288 dtrace_bcopy(const void *src, void *dst, size_t len)
 1289 {
 1290         if (len != 0) {
 1291                 uint8_t *s1 = dst;
 1292                 const uint8_t *s2 = src;
 1293 
 1294                 if (s1 <= s2) {
 1295                         do {
 1296                                 *s1++ = dtrace_load8((uintptr_t)s2++);
 1297                         } while (--len != 0);
 1298                 } else {
 1299                         s2 += len;
 1300                         s1 += len;
 1301 
 1302                         do {
 1303                                 *--s1 = dtrace_load8((uintptr_t)--s2);
 1304                         } while (--len != 0);
 1305                 }
 1306         }
 1307 }
 1308 
 1309 /*
 1310  * Copy src to dst using safe memory accesses, up to either the specified
 1311  * length, or the point that a nul byte is encountered.  The src is assumed to
 1312  * be unsafe memory specified by the DIF program.  The dst is assumed to be
 1313  * safe memory that we can store to directly because it is managed by DTrace.
 1314  * Unlike dtrace_bcopy(), overlapping regions are not handled.
 1315  */
 1316 static void
 1317 dtrace_strcpy(const void *src, void *dst, size_t len)
 1318 {
 1319         if (len != 0) {
 1320                 uint8_t *s1 = dst, c;
 1321                 const uint8_t *s2 = src;
 1322 
 1323                 do {
 1324                         *s1++ = c = dtrace_load8((uintptr_t)s2++);
 1325                 } while (--len != 0 && c != '\0');
 1326         }
 1327 }
 1328 
 1329 /*
 1330  * Copy src to dst, deriving the size and type from the specified (BYREF)
 1331  * variable type.  The src is assumed to be unsafe memory specified by the DIF
 1332  * program.  The dst is assumed to be DTrace variable memory that is of the
 1333  * specified type; we assume that we can store to directly.
 1334  */
 1335 static void
 1336 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
 1337 {
 1338         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 1339 
 1340         if (type->dtdt_kind == DIF_TYPE_STRING) {
 1341                 dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
 1342         } else {
 1343                 dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
 1344         }
 1345 }
 1346 
 1347 /*
 1348  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 1349  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 1350  * safe memory that we can access directly because it is managed by DTrace.
 1351  */
 1352 static int
 1353 dtrace_bcmp(const void *s1, const void *s2, size_t len)
 1354 {
 1355         volatile uint16_t *flags;
 1356 
 1357         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
 1358 
 1359         if (s1 == s2)
 1360                 return (0);
 1361 
 1362         if (s1 == NULL || s2 == NULL)
 1363                 return (1);
 1364 
 1365         if (s1 != s2 && len != 0) {
 1366                 const uint8_t *ps1 = s1;
 1367                 const uint8_t *ps2 = s2;
 1368 
 1369                 do {
 1370                         if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
 1371                                 return (1);
 1372                 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
 1373         }
 1374         return (0);
 1375 }
 1376 
 1377 /*
 1378  * Zero the specified region using a simple byte-by-byte loop.  Note that this
 1379  * is for safe DTrace-managed memory only.
 1380  */
 1381 static void
 1382 dtrace_bzero(void *dst, size_t len)
 1383 {
 1384         uchar_t *cp;
 1385 
 1386         for (cp = dst; len != 0; len--)
 1387                 *cp++ = 0;
 1388 }
 1389 
 1390 static void
 1391 dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
 1392 {
 1393         uint64_t result[2];
 1394 
 1395         result[0] = addend1[0] + addend2[0];
 1396         result[1] = addend1[1] + addend2[1] +
 1397             (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
 1398 
 1399         sum[0] = result[0];
 1400         sum[1] = result[1];
 1401 }
 1402 
 1403 /*
 1404  * Shift the 128-bit value in a by b. If b is positive, shift left.
 1405  * If b is negative, shift right.
 1406  */
 1407 static void
 1408 dtrace_shift_128(uint64_t *a, int b)
 1409 {
 1410         uint64_t mask;
 1411 
 1412         if (b == 0)
 1413                 return;
 1414 
 1415         if (b < 0) {
 1416                 b = -b;
 1417                 if (b >= 64) {
 1418                         a[0] = a[1] >> (b - 64);
 1419                         a[1] = 0;
 1420                 } else {
 1421                         a[0] >>= b;
 1422                         mask = 1LL << (64 - b);
 1423                         mask -= 1;
 1424                         a[0] |= ((a[1] & mask) << (64 - b));
 1425                         a[1] >>= b;
 1426                 }
 1427         } else {
 1428                 if (b >= 64) {
 1429                         a[1] = a[0] << (b - 64);
 1430                         a[0] = 0;
 1431                 } else {
 1432                         a[1] <<= b;
 1433                         mask = a[0] >> (64 - b);
 1434                         a[1] |= mask;
 1435                         a[0] <<= b;
 1436                 }
 1437         }
 1438 }
 1439 
 1440 /*
 1441  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 1442  * use native multiplication on those, and then re-combine into the
 1443  * resulting 128-bit value.
 1444  *
 1445  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 1446  *     hi1 * hi2 << 64 +
 1447  *     hi1 * lo2 << 32 +
 1448  *     hi2 * lo1 << 32 +
 1449  *     lo1 * lo2
 1450  */
 1451 static void
 1452 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
 1453 {
 1454         uint64_t hi1, hi2, lo1, lo2;
 1455         uint64_t tmp[2];
 1456 
 1457         hi1 = factor1 >> 32;
 1458         hi2 = factor2 >> 32;
 1459 
 1460         lo1 = factor1 & DT_MASK_LO;
 1461         lo2 = factor2 & DT_MASK_LO;
 1462 
 1463         product[0] = lo1 * lo2;
 1464         product[1] = hi1 * hi2;
 1465 
 1466         tmp[0] = hi1 * lo2;
 1467         tmp[1] = 0;
 1468         dtrace_shift_128(tmp, 32);
 1469         dtrace_add_128(product, tmp, product);
 1470 
 1471         tmp[0] = hi2 * lo1;
 1472         tmp[1] = 0;
 1473         dtrace_shift_128(tmp, 32);
 1474         dtrace_add_128(product, tmp, product);
 1475 }
 1476 
 1477 /*
 1478  * This privilege check should be used by actions and subroutines to
 1479  * verify that the user credentials of the process that enabled the
 1480  * invoking ECB match the target credentials
 1481  */
 1482 static int
 1483 dtrace_priv_proc_common_user(dtrace_state_t *state)
 1484 {
 1485         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
 1486 
 1487         /*
 1488          * We should always have a non-NULL state cred here, since if cred
 1489          * is null (anonymous tracing), we fast-path bypass this routine.
 1490          */
 1491         ASSERT(s_cr != NULL);
 1492 
 1493         if ((cr = CRED()) != NULL &&
 1494             s_cr->cr_uid == cr->cr_uid &&
 1495             s_cr->cr_uid == cr->cr_ruid &&
 1496             s_cr->cr_uid == cr->cr_suid &&
 1497             s_cr->cr_gid == cr->cr_gid &&
 1498             s_cr->cr_gid == cr->cr_rgid &&
 1499             s_cr->cr_gid == cr->cr_sgid)
 1500                 return (1);
 1501 
 1502         return (0);
 1503 }
 1504 
 1505 /*
 1506  * This privilege check should be used by actions and subroutines to
 1507  * verify that the zone of the process that enabled the invoking ECB
 1508  * matches the target credentials
 1509  */
 1510 static int
 1511 dtrace_priv_proc_common_zone(dtrace_state_t *state)
 1512 {
 1513 #ifdef illumos
 1514         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
 1515 
 1516         /*
 1517          * We should always have a non-NULL state cred here, since if cred
 1518          * is null (anonymous tracing), we fast-path bypass this routine.
 1519          */
 1520         ASSERT(s_cr != NULL);
 1521 
 1522         if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
 1523                 return (1);
 1524 
 1525         return (0);
 1526 #else
 1527         return (1);
 1528 #endif
 1529 }
 1530 
 1531 /*
 1532  * This privilege check should be used by actions and subroutines to
 1533  * verify that the process has not setuid or changed credentials.
 1534  */
 1535 static int
 1536 dtrace_priv_proc_common_nocd(void)
 1537 {
 1538         proc_t *proc;
 1539 
 1540         if ((proc = ttoproc(curthread)) != NULL &&
 1541             !(proc->p_flag & SNOCD))
 1542                 return (1);
 1543 
 1544         return (0);
 1545 }
 1546 
 1547 static int
 1548 dtrace_priv_proc_destructive(dtrace_state_t *state)
 1549 {
 1550         int action = state->dts_cred.dcr_action;
 1551 
 1552         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
 1553             dtrace_priv_proc_common_zone(state) == 0)
 1554                 goto bad;
 1555 
 1556         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
 1557             dtrace_priv_proc_common_user(state) == 0)
 1558                 goto bad;
 1559 
 1560         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
 1561             dtrace_priv_proc_common_nocd() == 0)
 1562                 goto bad;
 1563 
 1564         return (1);
 1565 
 1566 bad:
 1567         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
 1568 
 1569         return (0);
 1570 }
 1571 
 1572 static int
 1573 dtrace_priv_proc_control(dtrace_state_t *state)
 1574 {
 1575         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
 1576                 return (1);
 1577 
 1578         if (dtrace_priv_proc_common_zone(state) &&
 1579             dtrace_priv_proc_common_user(state) &&
 1580             dtrace_priv_proc_common_nocd())
 1581                 return (1);
 1582 
 1583         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
 1584 
 1585         return (0);
 1586 }
 1587 
 1588 static int
 1589 dtrace_priv_proc(dtrace_state_t *state)
 1590 {
 1591         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
 1592                 return (1);
 1593 
 1594         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
 1595 
 1596         return (0);
 1597 }
 1598 
 1599 static int
 1600 dtrace_priv_kernel(dtrace_state_t *state)
 1601 {
 1602         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
 1603                 return (1);
 1604 
 1605         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
 1606 
 1607         return (0);
 1608 }
 1609 
 1610 static int
 1611 dtrace_priv_kernel_destructive(dtrace_state_t *state)
 1612 {
 1613         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
 1614                 return (1);
 1615 
 1616         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
 1617 
 1618         return (0);
 1619 }
 1620 
/*
 * Determine if the dte_cond of the specified ECB allows for processing of
 * the current probe to continue.  Note that this routine may allow continued
 * processing, but with access(es) stripped from the mstate's dtms_access
 * field.
 *
 * Returns 1 if processing may continue and 0 if the probe firing is to be
 * dropped.  The ECB must have a non-zero dte_cond.  The provider's
 * dtps_mode() entry point is consulted, and access bits are stripped, only
 * in the sections compiled under illumos; elsewhere mode keeps its initial
 * value of DTRACE_MODE_NOPRIV_DROP.
 */
static int
dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
    dtrace_ecb_t *ecb)
{
        dtrace_probe_t *probe = ecb->dte_probe;
        dtrace_provider_t *prov = probe->dtpr_provider;
        dtrace_pops_t *pops = &prov->dtpv_pops;
        /* Default policy, used when the provider is not asked for a mode. */
        int mode = DTRACE_MODE_NOPRIV_DROP;

        ASSERT(ecb->dte_cond);

#ifdef illumos
        if (pops->dtps_mode != NULL) {
                mode = pops->dtps_mode(prov->dtpv_arg,
                    probe->dtpr_id, probe->dtpr_arg);

                /*
                 * The provider must report a context (user or kernel) and a
                 * no-privilege policy (restrict or drop).
                 */
                ASSERT((mode & DTRACE_MODE_USER) ||
                    (mode & DTRACE_MODE_KERNEL));
                ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
                    (mode & DTRACE_MODE_NOPRIV_DROP));
        }

        /*
         * If the dte_cond bits indicate that this consumer is only allowed to
         * see user-mode firings of this probe, call the provider's dtps_mode()
         * entry point to check that the probe was fired while in a user
         * context.  If that's not the case, use the policy specified by the
         * provider to determine if we drop the probe or merely restrict
         * operation.
         */
        if (ecb->dte_cond & DTRACE_COND_USERMODE) {
                ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);

                if (!(mode & DTRACE_MODE_USER)) {
                        if (mode & DTRACE_MODE_NOPRIV_DROP)
                                return (0);

                        mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
                }
        }
#endif

        /*
         * This is more subtle than it looks. We have to be absolutely certain
         * that CRED() isn't going to change out from under us so it's only
         * legit to examine that structure if we're in constrained situations.
         * Currently, the only time we'll perform this check is if a
         * non-super-user has enabled the profile or syscall providers --
         * providers that allow visibility of all processes. For the profile
         * case, the check above will ensure that we're examining a user
         * context.
         */
        if (ecb->dte_cond & DTRACE_COND_OWNER) {
                cred_t *cr;
                cred_t *s_cr = state->dts_cred.dcr_cred;
                proc_t *proc;

                ASSERT(s_cr != NULL);

                /*
                 * The firing is treated as not owned by the consumer if
                 * there is no current cred, if any of the current cred's
                 * effective, real or saved uid/gid differs from the
                 * enabling cred's uid/gid, if the current thread has no
                 * process, or if that process has SNOCD set.
                 */
                if ((cr = CRED()) == NULL ||
                    s_cr->cr_uid != cr->cr_uid ||
                    s_cr->cr_uid != cr->cr_ruid ||
                    s_cr->cr_uid != cr->cr_suid ||
                    s_cr->cr_gid != cr->cr_gid ||
                    s_cr->cr_gid != cr->cr_rgid ||
                    s_cr->cr_gid != cr->cr_sgid ||
                    (proc = ttoproc(curthread)) == NULL ||
                    (proc->p_flag & SNOCD)) {
                        if (mode & DTRACE_MODE_NOPRIV_DROP)
                                return (0);

#ifdef illumos
                        mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
#endif
                }
        }

#ifdef illumos
        /*
         * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
         * in our zone, check to see if our mode policy is to restrict rather
         * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
         * and DTRACE_ACCESS_ARGS
         */
        if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
                cred_t *cr;
                cred_t *s_cr = state->dts_cred.dcr_cred;

                ASSERT(s_cr != NULL);

                if ((cr = CRED()) == NULL ||
                    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
                        if (mode & DTRACE_MODE_NOPRIV_DROP)
                                return (0);

                        mstate->dtms_access &=
                            ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
                }
        }
#endif

        return (1);
}
 1729 
/*
 * Note:  not called from probe context.  This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 *
 * The work proceeds in three steps:  (1) for each CPU with a dirty list, move
 * that list onto a rinsing list (its own, or a borrowed idle CPU's) while
 * atomically clearing the dirty pointer; (2) after a dtrace_sync(), promote
 * every rinsing list to the owning CPU's clean list; (3) after a second
 * dtrace_sync(), mark the dynamic state DTRACE_DSTATE_CLEAN.  If no dirty
 * list could be moved, the function returns without changing dtds_state.
 */
void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
        dtrace_dynvar_t *dirty;
        dtrace_dstate_percpu_t *dcpu;
        dtrace_dynvar_t **rinsep;
        int i, j, work = 0;

        for (i = 0; i < NCPU; i++) {
                dcpu = &dstate->dtds_percpu[i];
                /* By default a CPU rinses its own dirty list. */
                rinsep = &dcpu->dtdsc_rinsing;

                /*
                 * If the dirty list is NULL, there is no dirty work to do.
                 */
                if (dcpu->dtdsc_dirty == NULL)
                        continue;

                if (dcpu->dtdsc_rinsing != NULL) {
                        /*
                         * If the rinsing list is non-NULL, then it is because
                         * this CPU was selected to accept another CPU's
                         * dirty list -- and since that time, dirty buffers
                         * have accumulated.  This is a highly unlikely
                         * condition, but we choose to ignore the dirty
                         * buffers -- they'll be picked up by a future
                         * cleanse.
                         */
                        continue;
                }

                if (dcpu->dtdsc_clean != NULL) {
                        /*
                         * If the clean list is non-NULL, then we're in a
                         * situation where a CPU has done deallocations (we
                         * have a non-NULL dirty list) but no allocations (we
                         * also have a non-NULL clean list).  We can't simply
                         * move the dirty list into the clean list on this
                         * CPU, yet we also don't want to allow this condition
                         * to persist, lest a short clean list prevent a
                         * massive dirty list from being cleaned (which in
                         * turn could lead to otherwise avoidable dynamic
                         * drops).  To deal with this, we look for some CPU
                         * with a NULL clean list, NULL dirty list, and NULL
                         * rinsing list -- and then we borrow this CPU to
                         * rinse our dirty list.
                         */
                        for (j = 0; j < NCPU; j++) {
                                dtrace_dstate_percpu_t *rinser;

                                rinser = &dstate->dtds_percpu[j];

                                if (rinser->dtdsc_rinsing != NULL)
                                        continue;

                                if (rinser->dtdsc_dirty != NULL)
                                        continue;

                                if (rinser->dtdsc_clean != NULL)
                                        continue;

                                /* Redirect the rinse to the idle CPU. */
                                rinsep = &rinser->dtdsc_rinsing;
                                break;
                        }

                        if (j == NCPU) {
                                /*
                                 * We were unable to find another CPU that
                                 * could accept this dirty list -- we are
                                 * therefore unable to clean it now.
                                 */
                                dtrace_dynvar_failclean++;
                                continue;
                        }
                }

                work = 1;

                /*
                 * Atomically move the dirty list aside.  The loop retries
                 * if the dirty pointer changed between the read and the
                 * compare-and-swap.
                 */
                do {
                        dirty = dcpu->dtdsc_dirty;

                        /*
                         * Before we zap the dirty list, set the rinsing list.
                         * (This allows for a potential assertion in
                         * dtrace_dynvar():  if a free dynamic variable appears
                         * on a hash chain, either the dirty list or the
                         * rinsing list for some CPU must be non-NULL.)
                         */
                        *rinsep = dirty;
                        dtrace_membar_producer();
                } while (dtrace_casptr(&dcpu->dtdsc_dirty,
                    dirty, NULL) != dirty);
        }

        if (!work) {
                /*
                 * We have no work to do; we can simply return.
                 */
                return;
        }

        dtrace_sync();

        for (i = 0; i < NCPU; i++) {
                dcpu = &dstate->dtds_percpu[i];

                if (dcpu->dtdsc_rinsing == NULL)
                        continue;

                /*
                 * We are now guaranteed that no hash chain contains a pointer
                 * into this dirty list; we can make it clean.
                 */
                ASSERT(dcpu->dtdsc_clean == NULL);
                dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
                dcpu->dtdsc_rinsing = NULL;
        }

        /*
         * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
         * sure that all CPUs have seen all of the dtdsc_clean pointers.
         * This prevents a race whereby a CPU incorrectly decides that
         * the state should be something other than DTRACE_DSTATE_CLEAN
         * after dtrace_dynvar_clean() has completed.
         */
        dtrace_sync();

        dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}
 1867 
 1868 /*
 1869  * Depending on the value of the op parameter, this function looks-up,
 1870  * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
 1871  * allocation is requested, this function will return a pointer to a
 1872  * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 1873  * variable can be allocated.  If NULL is returned, the appropriate counter
 1874  * will be incremented.
 1875  */
 1876 dtrace_dynvar_t *
 1877 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
 1878     dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
 1879     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
 1880 {
 1881         uint64_t hashval = DTRACE_DYNHASH_VALID;
 1882         dtrace_dynhash_t *hash = dstate->dtds_hash;
 1883         dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
 1884         processorid_t me = curcpu, cpu = me;
 1885         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
 1886         size_t bucket, ksize;
 1887         size_t chunksize = dstate->dtds_chunksize;
 1888         uintptr_t kdata, lock, nstate;
 1889         uint_t i;
 1890 
 1891         ASSERT(nkeys != 0);
 1892 
 1893         /*
 1894          * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
 1895          * algorithm.  For the by-value portions, we perform the algorithm in
 1896          * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
 1897          * bit, and seems to have only a minute effect on distribution.  For
 1898          * the by-reference data, we perform "One-at-a-time" iterating (safely)
 1899          * over each referenced byte.  It's painful to do this, but it's much
 1900          * better than pathological hash distribution.  The efficacy of the
 1901          * hashing algorithm (and a comparison with other algorithms) may be
 1902          * found by running the ::dtrace_dynstat MDB dcmd.
 1903          */
 1904         for (i = 0; i < nkeys; i++) {
 1905                 if (key[i].dttk_size == 0) {
 1906                         uint64_t val = key[i].dttk_value;
 1907 
 1908                         hashval += (val >> 48) & 0xffff;
 1909                         hashval += (hashval << 10);
 1910                         hashval ^= (hashval >> 6);
 1911 
 1912                         hashval += (val >> 32) & 0xffff;
 1913                         hashval += (hashval << 10);
 1914                         hashval ^= (hashval >> 6);
 1915 
 1916                         hashval += (val >> 16) & 0xffff;
 1917                         hashval += (hashval << 10);
 1918                         hashval ^= (hashval >> 6);
 1919 
 1920                         hashval += val & 0xffff;
 1921                         hashval += (hashval << 10);
 1922                         hashval ^= (hashval >> 6);
 1923                 } else {
 1924                         /*
 1925                          * This is incredibly painful, but it beats the hell
 1926                          * out of the alternative.
 1927                          */
 1928                         uint64_t j, size = key[i].dttk_size;
 1929                         uintptr_t base = (uintptr_t)key[i].dttk_value;
 1930 
 1931                         if (!dtrace_canload(base, size, mstate, vstate))
 1932                                 break;
 1933 
 1934                         for (j = 0; j < size; j++) {
 1935                                 hashval += dtrace_load8(base + j);
 1936                                 hashval += (hashval << 10);
 1937                                 hashval ^= (hashval >> 6);
 1938                         }
 1939                 }
 1940         }
 1941 
 1942         if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
 1943                 return (NULL);
 1944 
 1945         hashval += (hashval << 3);
 1946         hashval ^= (hashval >> 11);
 1947         hashval += (hashval << 15);
 1948 
 1949         /*
 1950          * There is a remote chance (ideally, 1 in 2^31) that our hashval
 1951          * comes out to be one of our two sentinel hash values.  If this
 1952          * actually happens, we set the hashval to be a value known to be a
 1953          * non-sentinel value.
 1954          */
 1955         if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
 1956                 hashval = DTRACE_DYNHASH_VALID;
 1957 
 1958         /*
 1959          * Yes, it's painful to do a divide here.  If the cycle count becomes
 1960          * important here, tricks can be pulled to reduce it.  (However, it's
 1961          * critical that hash collisions be kept to an absolute minimum;
 1962          * they're much more painful than a divide.)  It's better to have a
 1963          * solution that generates few collisions and still keeps things
 1964          * relatively simple.
 1965          */
 1966         bucket = hashval % dstate->dtds_hashsize;
 1967 
 1968         if (op == DTRACE_DYNVAR_DEALLOC) {
 1969                 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
 1970 
 1971                 for (;;) {
 1972                         while ((lock = *lockp) & 1)
 1973                                 continue;
 1974 
 1975                         if (dtrace_casptr((volatile void *)lockp,
 1976                             (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
 1977                                 break;
 1978                 }
 1979 
 1980                 dtrace_membar_producer();
 1981         }
 1982 
 1983 top:
 1984         prev = NULL;
 1985         lock = hash[bucket].dtdh_lock;
 1986 
 1987         dtrace_membar_consumer();
 1988 
 1989         start = hash[bucket].dtdh_chain;
 1990         ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
 1991             start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
 1992             op != DTRACE_DYNVAR_DEALLOC));
 1993 
 1994         for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
 1995                 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
 1996                 dtrace_key_t *dkey = &dtuple->dtt_key[0];
 1997 
 1998                 if (dvar->dtdv_hashval != hashval) {
 1999                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
 2000                                 /*
 2001                                  * We've reached the sink, and therefore the
 2002                                  * end of the hash chain; we can kick out of
 2003                                  * the loop knowing that we have seen a valid
 2004                                  * snapshot of state.
 2005                                  */
 2006                                 ASSERT(dvar->dtdv_next == NULL);
 2007                                 ASSERT(dvar == &dtrace_dynhash_sink);
 2008                                 break;
 2009                         }
 2010 
 2011                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
 2012                                 /*
 2013                                  * We've gone off the rails:  somewhere along
 2014                                  * the line, one of the members of this hash
 2015                                  * chain was deleted.  Note that we could also
 2016                                  * detect this by simply letting this loop run
 2017                                  * to completion, as we would eventually hit
 2018                                  * the end of the dirty list.  However, we
 2019                                  * want to avoid running the length of the
 2020                                  * dirty list unnecessarily (it might be quite
 2021                                  * long), so we catch this as early as
 2022                                  * possible by detecting the hash marker.  In
 2023                                  * this case, we simply set dvar to NULL and
 2024                                  * break; the conditional after the loop will
 2025                                  * send us back to top.
 2026                                  */
 2027                                 dvar = NULL;
 2028                                 break;
 2029                         }
 2030 
 2031                         goto next;
 2032                 }
 2033 
 2034                 if (dtuple->dtt_nkeys != nkeys)
 2035                         goto next;
 2036 
 2037                 for (i = 0; i < nkeys; i++, dkey++) {
 2038                         if (dkey->dttk_size != key[i].dttk_size)
 2039                                 goto next; /* size or type mismatch */
 2040 
 2041                         if (dkey->dttk_size != 0) {
 2042                                 if (dtrace_bcmp(
 2043                                     (void *)(uintptr_t)key[i].dttk_value,
 2044                                     (void *)(uintptr_t)dkey->dttk_value,
 2045                                     dkey->dttk_size))
 2046                                         goto next;
 2047                         } else {
 2048                                 if (dkey->dttk_value != key[i].dttk_value)
 2049                                         goto next;
 2050                         }
 2051                 }
 2052 
 2053                 if (op != DTRACE_DYNVAR_DEALLOC)
 2054                         return (dvar);
 2055 
 2056                 ASSERT(dvar->dtdv_next == NULL ||
 2057                     dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
 2058 
 2059                 if (prev != NULL) {
 2060                         ASSERT(hash[bucket].dtdh_chain != dvar);
 2061                         ASSERT(start != dvar);
 2062                         ASSERT(prev->dtdv_next == dvar);
 2063                         prev->dtdv_next = dvar->dtdv_next;
 2064                 } else {
 2065                         if (dtrace_casptr(&hash[bucket].dtdh_chain,
 2066                             start, dvar->dtdv_next) != start) {
 2067                                 /*
 2068                                  * We have failed to atomically swing the
 2069                                  * hash table head pointer, presumably because
 2070                                  * of a conflicting allocation on another CPU.
 2071                                  * We need to reread the hash chain and try
 2072                                  * again.
 2073                                  */
 2074                                 goto top;
 2075                         }
 2076                 }
 2077 
 2078                 dtrace_membar_producer();
 2079 
 2080                 /*
 2081                  * Now set the hash value to indicate that it's free.
 2082                  */
 2083                 ASSERT(hash[bucket].dtdh_chain != dvar);
 2084                 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
 2085 
 2086                 dtrace_membar_producer();
 2087 
 2088                 /*
 2089                  * Set the next pointer to point at the dirty list, and
 2090                  * atomically swing the dirty pointer to the newly freed dvar.
 2091                  */
 2092                 do {
 2093                         next = dcpu->dtdsc_dirty;
 2094                         dvar->dtdv_next = next;
 2095                 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
 2096 
 2097                 /*
 2098                  * Finally, unlock this hash bucket.
 2099                  */
 2100                 ASSERT(hash[bucket].dtdh_lock == lock);
 2101                 ASSERT(lock & 1);
 2102                 hash[bucket].dtdh_lock++;
 2103 
 2104                 return (NULL);
 2105 next:
 2106                 prev = dvar;
 2107                 continue;
 2108         }
 2109 
 2110         if (dvar == NULL) {
 2111                 /*
 2112                  * If dvar is NULL, it is because we went off the rails:
 2113                  * one of the elements that we traversed in the hash chain
 2114                  * was deleted while we were traversing it.  In this case,
 2115                  * we assert that we aren't doing a dealloc (deallocs lock
 2116                  * the hash bucket to prevent themselves from racing with
 2117                  * one another), and retry the hash chain traversal.
 2118                  */
 2119                 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
 2120                 goto top;
 2121         }
 2122 
 2123         if (op != DTRACE_DYNVAR_ALLOC) {
 2124                 /*
 2125                  * If we are not to allocate a new variable, we want to
 2126                  * return NULL now.  Before we return, check that the value
 2127                  * of the lock word hasn't changed.  If it has, we may have
 2128                  * seen an inconsistent snapshot.
 2129                  */
 2130                 if (op == DTRACE_DYNVAR_NOALLOC) {
 2131                         if (hash[bucket].dtdh_lock != lock)
 2132                                 goto top;
 2133                 } else {
 2134                         ASSERT(op == DTRACE_DYNVAR_DEALLOC);
 2135                         ASSERT(hash[bucket].dtdh_lock == lock);
 2136                         ASSERT(lock & 1);
 2137                         hash[bucket].dtdh_lock++;
 2138                 }
 2139 
 2140                 return (NULL);
 2141         }
 2142 
 2143         /*
 2144          * We need to allocate a new dynamic variable.  The size we need is the
 2145          * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
 2146          * size of any auxiliary key data (rounded up to 8-byte alignment) plus
 2147          * the size of any referred-to data (dsize).  We then round the final
 2148          * size up to the chunksize for allocation.
 2149          */
 2150         for (ksize = 0, i = 0; i < nkeys; i++)
 2151                 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
 2152 
 2153         /*
 2154          * This should be pretty much impossible, but could happen if, say,
 2155          * strange DIF specified the tuple.  Ideally, this should be an
 2156          * assertion and not an error condition -- but that requires that the
 2157          * chunksize calculation in dtrace_difo_chunksize() be absolutely
 2158          * bullet-proof.  (That is, it must not be able to be fooled by
 2159          * malicious DIF.)  Given the lack of backwards branches in DIF,
 2160          * solving this would presumably not amount to solving the Halting
 2161          * Problem -- but it still seems awfully hard.
 2162          */
 2163         if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
 2164             ksize + dsize > chunksize) {
 2165                 dcpu->dtdsc_drops++;
 2166                 return (NULL);
 2167         }
 2168 
 2169         nstate = DTRACE_DSTATE_EMPTY;
 2170 
 2171         do {
 2172 retry:
 2173                 free = dcpu->dtdsc_free;
 2174 
 2175                 if (free == NULL) {
 2176                         dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
 2177                         void *rval;
 2178 
 2179                         if (clean == NULL) {
 2180                                 /*
 2181                                  * We're out of dynamic variable space on
 2182                                  * this CPU.  Unless we have tried all CPUs,
 2183                                  * we'll try to allocate from a different
 2184                                  * CPU.
 2185                                  */
 2186                                 switch (dstate->dtds_state) {
 2187                                 case DTRACE_DSTATE_CLEAN: {
 2188                                         void *sp = &dstate->dtds_state;
 2189 
 2190                                         if (++cpu >= NCPU)
 2191                                                 cpu = 0;
 2192 
 2193                                         if (dcpu->dtdsc_dirty != NULL &&
 2194                                             nstate == DTRACE_DSTATE_EMPTY)
 2195                                                 nstate = DTRACE_DSTATE_DIRTY;
 2196 
 2197                                         if (dcpu->dtdsc_rinsing != NULL)
 2198                                                 nstate = DTRACE_DSTATE_RINSING;
 2199 
 2200                                         dcpu = &dstate->dtds_percpu[cpu];
 2201 
 2202                                         if (cpu != me)
 2203                                                 goto retry;
 2204 
 2205                                         (void) dtrace_cas32(sp,
 2206                                             DTRACE_DSTATE_CLEAN, nstate);
 2207 
 2208                                         /*
 2209                                          * To increment the correct bean
 2210                                          * counter, take another lap.
 2211                                          */
 2212                                         goto retry;
 2213                                 }
 2214 
 2215                                 case DTRACE_DSTATE_DIRTY:
 2216                                         dcpu->dtdsc_dirty_drops++;
 2217                                         break;
 2218 
 2219                                 case DTRACE_DSTATE_RINSING:
 2220                                         dcpu->dtdsc_rinsing_drops++;
 2221                                         break;
 2222 
 2223                                 case DTRACE_DSTATE_EMPTY:
 2224                                         dcpu->dtdsc_drops++;
 2225                                         break;
 2226                                 }
 2227 
 2228                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
 2229                                 return (NULL);
 2230                         }
 2231 
 2232                         /*
 2233                          * The clean list appears to be non-empty.  We want to
 2234                          * move the clean list to the free list; we start by
 2235                          * moving the clean pointer aside.
 2236                          */
 2237                         if (dtrace_casptr(&dcpu->dtdsc_clean,
 2238                             clean, NULL) != clean) {
 2239                                 /*
 2240                                  * We are in one of two situations:
 2241                                  *
 2242                                  *  (a) The clean list was switched to the
 2243                                  *      free list by another CPU.
 2244                                  *
 2245                                  *  (b) The clean list was added to by the
 2246                                  *      cleansing cyclic.
 2247                                  *
 2248                                  * In either of these situations, we can
 2249                                  * just reattempt the free list allocation.
 2250                                  */
 2251                                 goto retry;
 2252                         }
 2253 
 2254                         ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
 2255 
 2256                         /*
 2257                          * Now we'll move the clean list to our free list.
 2258                          * It's impossible for this to fail:  the only way
 2259                          * the free list can be updated is through this
 2260                          * code path, and only one CPU can own the clean list.
 2261                          * Thus, it would only be possible for this to fail if
 2262                          * this code were racing with dtrace_dynvar_clean().
 2263                          * (That is, if dtrace_dynvar_clean() updated the clean
 2264                          * list, and we ended up racing to update the free
 2265                          * list.)  This race is prevented by the dtrace_sync()
 2266                          * in dtrace_dynvar_clean() -- which flushes the
 2267                          * owners of the clean lists out before resetting
 2268                          * the clean lists.
 2269                          */
 2270                         dcpu = &dstate->dtds_percpu[me];
 2271                         rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
 2272                         ASSERT(rval == NULL);
 2273                         goto retry;
 2274                 }
 2275 
 2276                 dvar = free;
 2277                 new_free = dvar->dtdv_next;
 2278         } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
 2279 
 2280         /*
 2281          * We have now allocated a new chunk.  We copy the tuple keys into the
 2282          * tuple array and copy any referenced key data into the data space
 2283          * following the tuple array.  As we do this, we relocate dttk_value
 2284          * in the final tuple to point to the key data address in the chunk.
 2285          */
 2286         kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
 2287         dvar->dtdv_data = (void *)(kdata + ksize);
 2288         dvar->dtdv_tuple.dtt_nkeys = nkeys;
 2289 
 2290         for (i = 0; i < nkeys; i++) {
 2291                 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
 2292                 size_t kesize = key[i].dttk_size;
 2293 
 2294                 if (kesize != 0) {
 2295                         dtrace_bcopy(
 2296                             (const void *)(uintptr_t)key[i].dttk_value,
 2297                             (void *)kdata, kesize);
 2298                         dkey->dttk_value = kdata;
 2299                         kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
 2300                 } else {
 2301                         dkey->dttk_value = key[i].dttk_value;
 2302                 }
 2303 
 2304                 dkey->dttk_size = kesize;
 2305         }
 2306 
 2307         ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
 2308         dvar->dtdv_hashval = hashval;
 2309         dvar->dtdv_next = start;
 2310 
 2311         if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
 2312                 return (dvar);
 2313 
 2314         /*
 2315          * The cas has failed.  Either another CPU is adding an element to
 2316          * this hash chain, or another CPU is deleting an element from this
 2317          * hash chain.  The simplest way to deal with both of these cases
 2318          * (though not necessarily the most efficient) is to free our
 2319          * allocated block and re-attempt it all.  Note that the free is
 2320          * to the dirty list and _not_ to the free list.  This is to prevent
 2321          * races with allocators, above.
 2322          */
 2323         dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
 2324 
 2325         dtrace_membar_producer();
 2326 
 2327         do {
 2328                 free = dcpu->dtdsc_dirty;
 2329                 dvar->dtdv_next = free;
 2330         } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
 2331 
 2332         goto top;
 2333 }
 2334 
 2335 /*ARGSUSED*/
 2336 static void
 2337 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
 2338 {
 2339         if ((int64_t)nval < (int64_t)*oval)
 2340                 *oval = nval;
 2341 }
 2342 
 2343 /*ARGSUSED*/
 2344 static void
 2345 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
 2346 {
 2347         if ((int64_t)nval > (int64_t)*oval)
 2348                 *oval = nval;
 2349 }
 2350 
 2351 static void
 2352 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
 2353 {
 2354         int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
 2355         int64_t val = (int64_t)nval;
 2356 
 2357         if (val < 0) {
 2358                 for (i = 0; i < zero; i++) {
 2359                         if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
 2360                                 quanta[i] += incr;
 2361                                 return;
 2362                         }
 2363                 }
 2364         } else {
 2365                 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
 2366                         if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
 2367                                 quanta[i - 1] += incr;
 2368                                 return;
 2369                         }
 2370                 }
 2371 
 2372                 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
 2373                 return;
 2374         }
 2375 
 2376         ASSERT(0);
 2377 }
 2378 
 2379 static void
 2380 dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
 2381 {
 2382         uint64_t arg = *lquanta++;
 2383         int32_t base = DTRACE_LQUANTIZE_BASE(arg);
 2384         uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
 2385         uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
 2386         int32_t val = (int32_t)nval, level;
 2387 
 2388         ASSERT(step != 0);
 2389         ASSERT(levels != 0);
 2390 
 2391         if (val < base) {
 2392                 /*
 2393                  * This is an underflow.
 2394                  */
 2395                 lquanta[0] += incr;
 2396                 return;
 2397         }
 2398 
 2399         level = (val - base) / step;
 2400 
 2401         if (level < levels) {
 2402                 lquanta[level + 1] += incr;
 2403                 return;
 2404         }
 2405 
 2406         /*
 2407          * This is an overflow.
 2408          */
 2409         lquanta[levels + 1] += incr;
 2410 }
 2411 
 2412 static int
 2413 dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
 2414     uint16_t high, uint16_t nsteps, int64_t value)
 2415 {
 2416         int64_t this = 1, last, next;
 2417         int base = 1, order;
 2418 
 2419         ASSERT(factor <= nsteps);
 2420         ASSERT(nsteps % factor == 0);
 2421 
 2422         for (order = 0; order < low; order++)
 2423                 this *= factor;
 2424 
 2425         /*
 2426          * If our value is less than our factor taken to the power of the
 2427          * low order of magnitude, it goes into the zeroth bucket.
 2428          */
 2429         if (value < (last = this))
 2430                 return (0);
 2431 
 2432         for (this *= factor; order <= high; order++) {
 2433                 int nbuckets = this > nsteps ? nsteps : this;
 2434 
 2435                 if ((next = this * factor) < this) {
 2436                         /*
 2437                          * We should not generally get log/linear quantizations
 2438                          * with a high magnitude that allows 64-bits to
 2439                          * overflow, but we nonetheless protect against this
 2440                          * by explicitly checking for overflow, and clamping
 2441                          * our value accordingly.
 2442                          */
 2443                         value = this - 1;
 2444                 }
 2445 
 2446                 if (value < this) {
 2447                         /*
 2448                          * If our value lies within this order of magnitude,
 2449                          * determine its position by taking the offset within
 2450                          * the order of magnitude, dividing by the bucket
 2451                          * width, and adding to our (accumulated) base.
 2452                          */
 2453                         return (base + (value - last) / (this / nbuckets));
 2454                 }
 2455 
 2456                 base += nbuckets - (nbuckets / factor);
 2457                 last = this;
 2458                 this = next;
 2459         }
 2460 
 2461         /*
 2462          * Our value is greater than or equal to our factor taken to the
 2463          * power of one plus the high magnitude -- return the top bucket.
 2464          */
 2465         return (base);
 2466 }
 2467 
 2468 static void
 2469 dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
 2470 {
 2471         uint64_t arg = *llquanta++;
 2472         uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
 2473         uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
 2474         uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
 2475         uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
 2476 
 2477         llquanta[dtrace_aggregate_llquantize_bucket(factor,
 2478             low, high, nsteps, nval)] += incr;
 2479 }
 2480 
 2481 /*ARGSUSED*/
 2482 static void
 2483 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
 2484 {
 2485         data[0]++;
 2486         data[1] += nval;
 2487 }
 2488 
 2489 /*ARGSUSED*/
 2490 static void
 2491 dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
 2492 {
 2493         int64_t snval = (int64_t)nval;
 2494         uint64_t tmp[2];
 2495 
 2496         data[0]++;
 2497         data[1] += nval;
 2498 
 2499         /*
 2500          * What we want to say here is:
 2501          *
 2502          * data[2] += nval * nval;
 2503          *
 2504          * But given that nval is 64-bit, we could easily overflow, so
 2505          * we do this as 128-bit arithmetic.
 2506          */
 2507         if (snval < 0)
 2508                 snval = -snval;
 2509 
 2510         dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
 2511         dtrace_add_128(data + 2, tmp, data + 2);
 2512 }
 2513 
 2514 /*ARGSUSED*/
 2515 static void
 2516 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
 2517 {
 2518         *oval = *oval + 1;
 2519 }
 2520 
 2521 /*ARGSUSED*/
 2522 static void
 2523 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
 2524 {
 2525         *oval += nval;
 2526 }
 2527 
/*
 * Aggregate given the tuple in the principal data buffer, and the aggregating
 * action denoted by the specified dtrace_aggregation_t.  The aggregation
 * buffer is specified as the buf parameter.  This routine does not return
 * failure; if there is no space in the aggregation buffer, the data will be
 * dropped, and a corresponding counter incremented.
 *
 * Arguments:
 *
 *   agg     The aggregation; supplies the tuple layout, the aggregation id,
 *           the initial value and the aggregating function.
 *   dbuf    The principal buffer.  The tuple is read starting at
 *           dbuf->dtb_tomax + offset + agg->dtag_base.
 *   offset  Byte offset into dbuf used (together with agg->dtag_base) to
 *           locate the tuple.
 *   buf     The aggregation buffer.  If NULL, nothing is done.
 *   expr    The value handed to the aggregating function.
 *   arg     The extra argument handed to the aggregating function; it is
 *           forced to 1 when agg->dtag_hasarg is not set.
 *
 * As used by this routine, the aggregation buffer is laid out as follows:
 * key data and aggregated values are allocated upward from the start of the
 * buffer (tracked by buf->dtb_offset); a dtrace_aggbuffer_t occupies the very
 * end of the buffer; the hash table sits immediately below it; and
 * dtrace_aggkey_t structures are allocated downward from the hash table
 * (tracked by dtagb_free).
 *
 * If a matching key is found whose recorded action differs from this
 * aggregation's action, CPU_DTRACE_ILLOP is set and nothing is aggregated.
 */
static void
dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
{
        dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
        uint32_t i, ndx, size, fsize;
        uint32_t align = sizeof (uint64_t) - 1;
        dtrace_aggbuffer_t *agb;
        dtrace_aggkey_t *key;
        uint32_t hashval = 0, limit, isstr;
        caddr_t tomax, data, kdata;
        dtrace_actkind_t action;
        dtrace_action_t *act;
        uintptr_t offs;

        if (buf == NULL)
                return;

        if (!agg->dtag_hasarg) {
                /*
                 * Currently, only quantize() and lquantize() take additional
                 * arguments, and they have the same semantics:  an increment
                 * value that defaults to 1 when not present.  If additional
                 * aggregating actions take arguments, the setting of the
                 * default argument value will presumably have to become more
                 * sophisticated...
                 */
                arg = 1;
        }

        /*
         * size is the number of bytes from the base of the aggregation to
         * the aggregating action's own record (that is, the key); fsize
         * additionally includes that record (the aggregated value).
         */
        action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
        size = rec->dtrd_offset - agg->dtag_base;
        fsize = size + rec->dtrd_size;

        ASSERT(dbuf->dtb_tomax != NULL);
        data = dbuf->dtb_tomax + offset + agg->dtag_base;

        if ((tomax = buf->dtb_tomax) == NULL) {
                dtrace_buffer_drop(buf);
                return;
        }

        /*
         * The metastructure is always at the bottom of the buffer.
         */
        agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
            sizeof (dtrace_aggbuffer_t));

        /*
         * An offset of zero means that nothing has been allocated from this
         * buffer yet, so the hash table must be set up first.
         */
        if (buf->dtb_offset == 0) {
                /*
                 * We just kludge up approximately 1/8th of the size to be
                 * buckets.  If this guess ends up being routinely
                 * off-the-mark, we may need to dynamically readjust this
                 * based on past performance.
                 */
                uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);

                if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
                    (uintptr_t)tomax || hashsize == 0) {
                        /*
                         * We've been given a ludicrously small buffer;
                         * increment our drop count and leave.
                         */
                        dtrace_buffer_drop(buf);
                        return;
                }

                /*
                 * And now, a pathetic attempt to try to get an odd (or
                 * perchance, a prime) hash size for better hash distribution.
                 */
                if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
                        hashsize -= DTRACE_AGGHASHSIZE_SLEW;

                /*
                 * The hash table is placed directly below the metastructure,
                 * and key structures will be carved out downward from the
                 * start of the hash table.
                 */
                agb->dtagb_hashsize = hashsize;
                agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
                    agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
                agb->dtagb_free = (uintptr_t)agb->dtagb_hash;

                for (i = 0; i < agb->dtagb_hashsize; i++)
                        agb->dtagb_hash[i] = NULL;
        }

        ASSERT(agg->dtag_first != NULL);
        ASSERT(agg->dtag_first->dta_intuple);

        /*
         * Calculate the hash value based on the key.  Note that we _don't_
         * include the aggid in the hashing (but we will store it as part of
         * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
         * algorithm: a simple, quick algorithm that has no known funnels, and
         * gets good distribution in practice.  The efficacy of the hashing
         * algorithm (and a comparison with other algorithms) may be found by
         * running the ::dtrace_aggstat MDB dcmd.
         */
        for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
                i = act->dta_rec.dtrd_offset - agg->dtag_base;
                limit = i + act->dta_rec.dtrd_size;
                ASSERT(limit <= size);
                isstr = DTRACEACT_ISSTRING(act);

                for (; i < limit; i++) {
                        hashval += data[i];
                        hashval += (hashval << 10);
                        hashval ^= (hashval >> 6);

                        /*
                         * For strings, bytes after the terminating NUL do
                         * not contribute to the hash.
                         */
                        if (isstr && data[i] == '\0')
                                break;
                }
        }

        hashval += (hashval << 3);
        hashval ^= (hashval >> 11);
        hashval += (hashval << 15);

        /*
         * Yes, the divide here is expensive -- but it's generally the least
         * of the performance issues given the amount of data that we iterate
         * over to compute hash values, compare data, etc.
         */
        ndx = hashval % agb->dtagb_hashsize;

        /*
         * Walk the hash chain looking for an existing key with the same hash
         * value, the same size and the same tuple contents.
         */
        for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
                ASSERT((caddr_t)key >= tomax);
                ASSERT((caddr_t)key < tomax + buf->dtb_size);

                if (hashval != key->dtak_hashval || key->dtak_size != size)
                        continue;

                kdata = key->dtak_data;
                ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);

                /*
                 * Compare the tuple member by member; as with hashing,
                 * string members are only compared up to and including the
                 * terminating NUL.
                 */
                for (act = agg->dtag_first; act->dta_intuple;
                    act = act->dta_next) {
                        i = act->dta_rec.dtrd_offset - agg->dtag_base;
                        limit = i + act->dta_rec.dtrd_size;
                        ASSERT(limit <= size);
                        isstr = DTRACEACT_ISSTRING(act);

                        for (; i < limit; i++) {
                                if (kdata[i] != data[i])
                                        goto next;

                                if (isstr && data[i] == '\0')
                                        break;
                        }
                }

                if (action != key->dtak_action) {
                        /*
                         * We are aggregating on the same value in the same
                         * aggregation with two different aggregating actions.
                         * (This should have been picked up in the compiler,
                         * so we may be dealing with errant or devious DIF.)
                         * This is an error condition; we indicate as much,
                         * and return.
                         */
                        DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
                        return;
                }

                /*
                 * This is a hit:  we need to apply the aggregator to
                 * the value at this key.
                 */
                agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
                return;
next:
                /* The tuple did not match; try the next key on the chain. */
                continue;
        }

        /*
         * We didn't find it.  We need to allocate some zero-filled space,
         * link it into the hash table appropriately, and apply the aggregator
         * to the (zero-filled) value.
         *
         * NOTE(review): align is sizeof (uint64_t) - 1, so the mask tested
         * below is (align - 1) == 6 rather than 7.  Stepping by
         * sizeof (uint32_t) therefore reaches an 8-byte boundary only if
         * dtb_offset is already a multiple of 4 -- confirm that this always
         * holds.
         */
        offs = buf->dtb_offset;
        while (offs & (align - 1))
                offs += sizeof (uint32_t);

        /*
         * If we don't have enough room to both allocate a new key _and_
         * its associated data, increment the drop count and return.
         */
        if ((uintptr_t)tomax + offs + fsize >
            agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
                dtrace_buffer_drop(buf);
                return;
        }

        /*
         * The key structure is taken from the top (just below dtagb_free),
         * while the key data and value are taken from the bottom (at offs).
         */
        /*CONSTCOND*/
        ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
        key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
        agb->dtagb_free -= sizeof (dtrace_aggkey_t);

        key->dtak_data = kdata = tomax + offs;
        buf->dtb_offset = offs + fsize;

        /*
         * Now copy the data across.  The leading dtrace_aggid_t of the key
         * is set to the aggregation id rather than copied from the tuple.
         */
        *((dtrace_aggid_t *)kdata) = agg->dtag_id;

        for (i = sizeof (dtrace_aggid_t); i < size; i++)
                kdata[i] = data[i];

        /*
         * Because strings are not zeroed out by default, we need to iterate
         * looking for actions that store strings, and we need to explicitly
         * pad these strings out with zeroes.
         */
        for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
                int nul;

                if (!DTRACEACT_ISSTRING(act))
                        continue;

                i = act->dta_rec.dtrd_offset - agg->dtag_base;
                limit = i + act->dta_rec.dtrd_size;
                ASSERT(limit <= size);

                /*
                 * Once the terminating NUL has been seen in the source,
                 * every following byte of the stored string is zeroed.
                 */
                for (nul = 0; i < limit; i++) {
                        if (nul) {
                                kdata[i] = '\0';
                                continue;
                        }

                        if (data[i] != '\0')
                                continue;

                        nul = 1;
                }
        }

        /* Zero-fill the space for the aggregated value. */
        for (i = size; i < fsize; i++)
                kdata[i] = 0;

        /* Fill in the key and push it onto the front of its hash chain. */
        key->dtak_hashval = hashval;
        key->dtak_size = size;
        key->dtak_action = action;
        key->dtak_next = agb->dtagb_hash[ndx];
        agb->dtagb_hash[ndx] = key;

        /*
         * Finally, apply the aggregator.  The first word of the value is
         * seeded with the aggregation's initial value beforehand.
         */
        *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
        agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
}
 2784 
 2785 /*
 2786  * Given consumer state, this routine finds a speculation in the INACTIVE
 2787  * state and transitions it into the ACTIVE state.  If there is no speculation
 2788  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
 2789  * incremented -- it is up to the caller to take appropriate action.
 2790  */
 2791 static int
 2792 dtrace_speculation(dtrace_state_t *state)
 2793 {
 2794         int i = 0;
 2795         dtrace_speculation_state_t curstate;
 2796         uint32_t *stat = &state->dts_speculations_unavail, count;
 2797 
 2798         while (i < state->dts_nspeculations) {
 2799                 dtrace_speculation_t *spec = &state->dts_speculations[i];
 2800 
 2801                 curstate = spec->dtsp_state;
 2802 
 2803                 if (curstate != DTRACESPEC_INACTIVE) {
 2804                         if (curstate == DTRACESPEC_COMMITTINGMANY ||
 2805                             curstate == DTRACESPEC_COMMITTING ||
 2806                             curstate == DTRACESPEC_DISCARDING)
 2807                                 stat = &state->dts_speculations_busy;
 2808                         i++;
 2809                         continue;
 2810                 }
 2811 
 2812                 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
 2813                     curstate, DTRACESPEC_ACTIVE) == curstate)
 2814                         return (i + 1);
 2815         }
 2816 
 2817         /*
 2818          * We couldn't find a speculation.  If we found as much as a single
 2819          * busy speculation buffer, we'll attribute this failure as "busy"
 2820          * instead of "unavail".
 2821          */
 2822         do {
 2823                 count = *stat;
 2824         } while (dtrace_cas32(stat, count, count + 1) != count);
 2825 
 2826         return (0);
 2827 }
 2828 
 2829 /*
 2830  * This routine commits an active speculation.  If the specified speculation
 2831  * is not in a valid state to perform a commit(), this routine will silently do
 2832  * nothing.  The state of the specified speculation is transitioned according
 2833  * to the state transition diagram outlined in <sys/dtrace_impl.h>
 2834  */
 2835 static void
 2836 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
 2837     dtrace_specid_t which)
 2838 {
 2839         dtrace_speculation_t *spec;
 2840         dtrace_buffer_t *src, *dest;
 2841         uintptr_t daddr, saddr, dlimit, slimit;
 2842         dtrace_speculation_state_t curstate, new = 0;
 2843         intptr_t offs;
 2844         uint64_t timestamp;
 2845 
 2846         if (which == 0)
 2847                 return;
 2848 
 2849         if (which > state->dts_nspeculations) {
 2850                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
 2851                 return;
 2852         }
 2853 
 2854         spec = &state->dts_speculations[which - 1];
 2855         src = &spec->dtsp_buffer[cpu];
 2856         dest = &state->dts_buffer[cpu];
 2857 
 2858         do {
 2859                 curstate = spec->dtsp_state;
 2860 
 2861                 if (curstate == DTRACESPEC_COMMITTINGMANY)
 2862                         break;
 2863 
 2864                 switch (curstate) {
 2865                 case DTRACESPEC_INACTIVE:
 2866                 case DTRACESPEC_DISCARDING:
 2867                         return;
 2868 
 2869                 case DTRACESPEC_COMMITTING:
 2870                         /*
 2871                          * This is only possible if we are (a) commit()'ing
 2872                          * without having done a prior speculate() on this CPU
 2873                          * and (b) racing with another commit() on a different
 2874                          * CPU.  There's nothing to do -- we just assert that
 2875                          * our offset is 0.
 2876                          */
 2877                         ASSERT(src->dtb_offset == 0);
 2878                         return;
 2879 
 2880                 case DTRACESPEC_ACTIVE:
 2881                         new = DTRACESPEC_COMMITTING;
 2882                         break;
 2883 
 2884                 case DTRACESPEC_ACTIVEONE:
 2885                         /*
 2886                          * This speculation is active on one CPU.  If our
 2887                          * buffer offset is non-zero, we know that the one CPU
 2888                          * must be us.  Otherwise, we are committing on a
 2889                          * different CPU from the speculate(), and we must
 2890                          * rely on being asynchronously cleaned.
 2891                          */
 2892                         if (src->dtb_offset != 0) {
 2893                                 new = DTRACESPEC_COMMITTING;
 2894                                 break;
 2895                         }
 2896                         /*FALLTHROUGH*/
 2897 
 2898                 case DTRACESPEC_ACTIVEMANY:
 2899                         new = DTRACESPEC_COMMITTINGMANY;
 2900                         break;
 2901 
 2902                 default:
 2903                         ASSERT(0);
 2904                 }
 2905         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
 2906             curstate, new) != curstate);
 2907 
 2908         /*
 2909          * We have set the state to indicate that we are committing this
 2910          * speculation.  Now reserve the necessary space in the destination
 2911          * buffer.
 2912          */
 2913         if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
 2914             sizeof (uint64_t), state, NULL)) < 0) {
 2915                 dtrace_buffer_drop(dest);
 2916                 goto out;
 2917         }
 2918 
 2919         /*
 2920          * We have sufficient space to copy the speculative buffer into the
 2921          * primary buffer.  First, modify the speculative buffer, filling
 2922          * in the timestamp of all entries with the current time.  The data
 2923          * must have the commit() time rather than the time it was traced,
 2924          * so that all entries in the primary buffer are in timestamp order.
 2925          */
 2926         timestamp = dtrace_gethrtime();
 2927         saddr = (uintptr_t)src->dtb_tomax;
 2928         slimit = saddr + src->dtb_offset;
 2929         while (saddr < slimit) {
 2930                 size_t size;
 2931                 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
 2932 
 2933                 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
 2934                         saddr += sizeof (dtrace_epid_t);
 2935                         continue;
 2936                 }
 2937                 ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
 2938                 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
 2939 
 2940                 ASSERT3U(saddr + size, <=, slimit);
 2941                 ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
 2942                 ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
 2943 
 2944                 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
 2945 
 2946                 saddr += size;
 2947         }
 2948 
 2949         /*
 2950          * Copy the buffer across.  (Note that this is a
 2951          * highly suboptimal bcopy(); in the unlikely event that this becomes
 2952          * a serious performance issue, a high-performance DTrace-specific
 2953          * bcopy() should obviously be invented.)
 2954          */
 2955         daddr = (uintptr_t)dest->dtb_tomax + offs;
 2956         dlimit = daddr + src->dtb_offset;
 2957         saddr = (uintptr_t)src->dtb_tomax;
 2958 
 2959         /*
 2960          * First, the aligned portion.
 2961          */
 2962         while (dlimit - daddr >= sizeof (uint64_t)) {
 2963                 *((uint64_t *)daddr) = *((uint64_t *)saddr);
 2964 
 2965                 daddr += sizeof (uint64_t);
 2966                 saddr += sizeof (uint64_t);
 2967         }
 2968 
 2969         /*
 2970          * Now any left-over bit...
 2971          */
 2972         while (dlimit - daddr)
 2973                 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
 2974 
 2975         /*
 2976          * Finally, commit the reserved space in the destination buffer.
 2977          */
 2978         dest->dtb_offset = offs + src->dtb_offset;
 2979 
 2980 out:
 2981         /*
 2982          * If we're lucky enough to be the only active CPU on this speculation
 2983          * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
 2984          */
 2985         if (curstate == DTRACESPEC_ACTIVE ||
 2986             (curstate == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
 2987                 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
 2988                     DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
 2989 
 2990                 ASSERT(rval == DTRACESPEC_COMMITTING);
 2991         }
 2992 
 2993         src->dtb_offset = 0;
 2994         src->dtb_xamot_drops += src->dtb_drops;
 2995         src->dtb_drops = 0;
 2996 }
 2997 
 2998 /*
 2999  * This routine discards an active speculation.  If the specified speculation
 3000  * is not in a valid state to perform a discard(), this routine will silently
 3001  * do nothing.  The state of the specified speculation is transitioned
 3002  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
 3003  */
 3004 static void
 3005 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
 3006     dtrace_specid_t which)
 3007 {
 3008         dtrace_speculation_t *spec;
 3009         dtrace_speculation_state_t curstate, new = 0;
 3010         dtrace_buffer_t *buf;
 3011 
 3012         if (which == 0)
 3013                 return;
 3014 
 3015         if (which > state->dts_nspeculations) {
 3016                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
 3017                 return;
 3018         }
 3019 
 3020         spec = &state->dts_speculations[which - 1];
 3021         buf = &spec->dtsp_buffer[cpu];
 3022 
 3023         do {
 3024                 curstate = spec->dtsp_state;
 3025 
 3026                 switch (curstate) {
 3027                 case DTRACESPEC_INACTIVE:
 3028                 case DTRACESPEC_COMMITTINGMANY:
 3029                 case DTRACESPEC_COMMITTING:
 3030                 case DTRACESPEC_DISCARDING:
 3031                         return;
 3032 
 3033                 case DTRACESPEC_ACTIVE:
 3034                 case DTRACESPEC_ACTIVEMANY:
 3035                         new = DTRACESPEC_DISCARDING;
 3036                         break;
 3037 
 3038                 case DTRACESPEC_ACTIVEONE:
 3039                         if (buf->dtb_offset != 0) {
 3040                                 new = DTRACESPEC_INACTIVE;
 3041                         } else {
 3042                                 new = DTRACESPEC_DISCARDING;
 3043                         }
 3044                         break;
 3045 
 3046                 default:
 3047                         ASSERT(0);
 3048                 }
 3049         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
 3050             curstate, new) != curstate);
 3051 
 3052         buf->dtb_offset = 0;
 3053         buf->dtb_drops = 0;
 3054 }
 3055 
 3056 /*
 3057  * Note:  not called from probe context.  This function is called
 3058  * asynchronously from cross call context to clean any speculations that are
 3059  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
 3060  * transitioned back to the INACTIVE state until all CPUs have cleaned the
 3061  * speculation.
 3062  */
 3063 static void
 3064 dtrace_speculation_clean_here(dtrace_state_t *state)
 3065 {
 3066         dtrace_icookie_t cookie;
 3067         processorid_t cpu = curcpu;
 3068         dtrace_buffer_t *dest = &state->dts_buffer[cpu];
 3069         dtrace_specid_t i;
 3070 
 3071         cookie = dtrace_interrupt_disable();
 3072 
 3073         if (dest->dtb_tomax == NULL) {
 3074                 dtrace_interrupt_enable(cookie);
 3075                 return;
 3076         }
 3077 
 3078         for (i = 0; i < state->dts_nspeculations; i++) {
 3079                 dtrace_speculation_t *spec = &state->dts_speculations[i];
 3080                 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
 3081 
 3082                 if (src->dtb_tomax == NULL)
 3083                         continue;
 3084 
 3085                 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
 3086                         src->dtb_offset = 0;
 3087                         continue;
 3088                 }
 3089 
 3090                 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
 3091                         continue;
 3092 
 3093                 if (src->dtb_offset == 0)
 3094                         continue;
 3095 
 3096                 dtrace_speculation_commit(state, cpu, i + 1);
 3097         }
 3098 
 3099         dtrace_interrupt_enable(cookie);
 3100 }
 3101 
 3102 /*
 3103  * Note:  not called from probe context.  This function is called
 3104  * asynchronously (and at a regular interval) to clean any speculations that
 3105  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
 3106  * is work to be done, it cross calls all CPUs to perform that work;
 3107  * COMMITMANY and DISCARDING speculations may not be transitioned back to the
 3108  * INACTIVE state until they have been cleaned by all CPUs.
 3109  */
 3110 static void
 3111 dtrace_speculation_clean(dtrace_state_t *state)
 3112 {
 3113         int work = 0, rv;
 3114         dtrace_specid_t i;
 3115 
 3116         for (i = 0; i < state->dts_nspeculations; i++) {
 3117                 dtrace_speculation_t *spec = &state->dts_speculations[i];
 3118 
 3119                 ASSERT(!spec->dtsp_cleaning);
 3120 
 3121                 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
 3122                     spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
 3123                         continue;
 3124 
 3125                 work++;
 3126                 spec->dtsp_cleaning = 1;
 3127         }
 3128 
 3129         if (!work)
 3130                 return;
 3131 
 3132         dtrace_xcall(DTRACE_CPUALL,
 3133             (dtrace_xcall_t)dtrace_speculation_clean_here, state);
 3134 
 3135         /*
 3136          * We now know that all CPUs have committed or discarded their
 3137          * speculation buffers, as appropriate.  We can now set the state
 3138          * to inactive.
 3139          */
 3140         for (i = 0; i < state->dts_nspeculations; i++) {
 3141                 dtrace_speculation_t *spec = &state->dts_speculations[i];
 3142                 dtrace_speculation_state_t curstate, new;
 3143 
 3144                 if (!spec->dtsp_cleaning)
 3145                         continue;
 3146 
 3147                 curstate = spec->dtsp_state;
 3148                 ASSERT(curstate == DTRACESPEC_DISCARDING ||
 3149                     curstate == DTRACESPEC_COMMITTINGMANY);
 3150 
 3151                 new = DTRACESPEC_INACTIVE;
 3152 
 3153                 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, curstate, new);
 3154                 ASSERT(rv == curstate);
 3155                 spec->dtsp_cleaning = 0;
 3156         }
 3157 }
 3158 
 3159 /*
 3160  * Called as part of a speculate() to get the speculative buffer associated
 3161  * with a given speculation.  Returns NULL if the specified speculation is not
 3162  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
 3163  * the active CPU is not the specified CPU -- the speculation will be
 3164  * atomically transitioned into the ACTIVEMANY state.
 3165  */
 3166 static dtrace_buffer_t *
 3167 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
 3168     dtrace_specid_t which)
 3169 {
 3170         dtrace_speculation_t *spec;
 3171         dtrace_speculation_state_t curstate, new = 0;
 3172         dtrace_buffer_t *buf;
 3173 
 3174         if (which == 0)
 3175                 return (NULL);
 3176 
 3177         if (which > state->dts_nspeculations) {
 3178                 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
 3179                 return (NULL);
 3180         }
 3181 
 3182         spec = &state->dts_speculations[which - 1];
 3183         buf = &spec->dtsp_buffer[cpuid];
 3184 
 3185         do {
 3186                 curstate = spec->dtsp_state;
 3187 
 3188                 switch (curstate) {
 3189                 case DTRACESPEC_INACTIVE:
 3190                 case DTRACESPEC_COMMITTINGMANY:
 3191                 case DTRACESPEC_DISCARDING:
 3192                         return (NULL);
 3193 
 3194                 case DTRACESPEC_COMMITTING:
 3195                         ASSERT(buf->dtb_offset == 0);
 3196                         return (NULL);
 3197 
 3198                 case DTRACESPEC_ACTIVEONE:
 3199                         /*
 3200                          * This speculation is currently active on one CPU.
 3201                          * Check the offset in the buffer; if it's non-zero,
 3202                          * that CPU must be us (and we leave the state alone).
 3203                          * If it's zero, assume that we're starting on a new
 3204                          * CPU -- and change the state to indicate that the
 3205                          * speculation is active on more than one CPU.
 3206                          */
 3207                         if (buf->dtb_offset != 0)
 3208                                 return (buf);
 3209 
 3210                         new = DTRACESPEC_ACTIVEMANY;
 3211                         break;
 3212 
 3213                 case DTRACESPEC_ACTIVEMANY:
 3214                         return (buf);
 3215 
 3216                 case DTRACESPEC_ACTIVE:
 3217                         new = DTRACESPEC_ACTIVEONE;
 3218                         break;
 3219 
 3220                 default:
 3221                         ASSERT(0);
 3222                 }
 3223         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
 3224             curstate, new) != curstate);
 3225 
 3226         ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
 3227         return (buf);
 3228 }
 3229 
 3230 /*
 3231  * Return a string.  In the event that the user lacks the privilege to access
 3232  * arbitrary kernel memory, we copy the string out to scratch memory so that we
 3233  * don't fail access checking.
 3234  *
 3235  * dtrace_dif_variable() uses this routine as a helper for various
 3236  * builtin values such as 'execname' and 'probefunc.'
 3237  */
 3238 uintptr_t
 3239 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
 3240     dtrace_mstate_t *mstate)
 3241 {
 3242         uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 3243         uintptr_t ret;
 3244         size_t strsz;
 3245 
 3246         /*
 3247          * The easy case: this probe is allowed to read all of memory, so
 3248          * we can just return this as a vanilla pointer.
 3249          */
 3250         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 3251                 return (addr);
 3252 
 3253         /*
 3254          * This is the tougher case: we copy the string in question from
 3255          * kernel memory into scratch memory and return it that way: this
 3256          * ensures that we won't trip up when access checking tests the
 3257          * BYREF return value.
 3258          */
 3259         strsz = dtrace_strlen((char *)addr, size) + 1;
 3260 
 3261         if (mstate->dtms_scratch_ptr + strsz >
 3262             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
 3263                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 3264                 return (0);
 3265         }
 3266 
 3267         dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
 3268             strsz);
 3269         ret = mstate->dtms_scratch_ptr;
 3270         mstate->dtms_scratch_ptr += strsz;
 3271         return (ret);
 3272 }
 3273 
 3274 /*
 3275  * Return a string from a memoy address which is known to have one or
 3276  * more concatenated, individually zero terminated, sub-strings.
 3277  * In the event that the user lacks the privilege to access
 3278  * arbitrary kernel memory, we copy the string out to scratch memory so that we
 3279  * don't fail access checking.
 3280  *
 3281  * dtrace_dif_variable() uses this routine as a helper for various
 3282  * builtin values such as 'execargs'.
 3283  */
 3284 static uintptr_t
 3285 dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
 3286     dtrace_mstate_t *mstate)
 3287 {
 3288         char *p;
 3289         size_t i;
 3290         uintptr_t ret;
 3291 
 3292         if (mstate->dtms_scratch_ptr + strsz >
 3293             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
 3294                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 3295                 return (0);
 3296         }
 3297 
 3298         dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
 3299             strsz);
 3300 
 3301         /* Replace sub-string termination characters with a space. */
 3302         for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
 3303             p++, i++)
 3304                 if (*p == '\0')
 3305                         *p = ' ';
 3306 
 3307         ret = mstate->dtms_scratch_ptr;
 3308         mstate->dtms_scratch_ptr += strsz;
 3309         return (ret);
 3310 }
 3311 
 3312 /*
 3313  * This function implements the DIF emulator's variable lookups.  The emulator
 3314  * passes a reserved variable identifier and optional built-in array index.
 3315  */
 3316 static uint64_t
 3317 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
 3318     uint64_t ndx)
 3319 {
 3320         /*
 3321          * If we're accessing one of the uncached arguments, we'll turn this
 3322          * into a reference in the args array.
 3323          */
 3324         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
 3325                 ndx = v - DIF_VAR_ARG0;
 3326                 v = DIF_VAR_ARGS;
 3327         }
 3328 
 3329         switch (v) {
 3330         case DIF_VAR_ARGS:
 3331                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
 3332                 if (ndx >= sizeof (mstate->dtms_arg) /
 3333                     sizeof (mstate->dtms_arg[0])) {
 3334                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
 3335                         dtrace_provider_t *pv;
 3336                         uint64_t val;
 3337 
 3338                         pv = mstate->dtms_probe->dtpr_provider;
 3339                         if (pv->dtpv_pops.dtps_getargval != NULL)
 3340                                 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
 3341                                     mstate->dtms_probe->dtpr_id,
 3342                                     mstate->dtms_probe->dtpr_arg, ndx, aframes);
 3343                         else
 3344                                 val = dtrace_getarg(ndx, aframes);
 3345 
 3346                         /*
 3347                          * This is regrettably required to keep the compiler
 3348                          * from tail-optimizing the call to dtrace_getarg().
 3349                          * The condition always evaluates to true, but the
 3350                          * compiler has no way of figuring that out a priori.
 3351                          * (None of this would be necessary if the compiler
 3352                          * could be relied upon to _always_ tail-optimize
 3353                          * the call to dtrace_getarg() -- but it can't.)
 3354                          */
 3355                         if (mstate->dtms_probe != NULL)
 3356                                 return (val);
 3357 
 3358                         ASSERT(0);
 3359                 }
 3360 
 3361                 return (mstate->dtms_arg[ndx]);
 3362 
 3363         case DIF_VAR_REGS:
 3364         case DIF_VAR_UREGS: {
 3365                 struct trapframe *tframe;
 3366 
 3367                 if (!dtrace_priv_proc(state))
 3368                         return (0);
 3369 
 3370                 if (v == DIF_VAR_REGS)
 3371                         tframe = curthread->t_dtrace_trapframe;
 3372                 else
 3373                         tframe = curthread->td_frame;
 3374 
 3375                 if (tframe == NULL) {
 3376                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
 3377                         cpu_core[curcpu].cpuc_dtrace_illval = 0;
 3378                         return (0);
 3379                 }
 3380 
 3381                 return (dtrace_getreg(tframe, ndx));
 3382         }
 3383 
 3384         case DIF_VAR_CURTHREAD:
 3385                 if (!dtrace_priv_proc(state))
 3386                         return (0);
 3387                 return ((uint64_t)(uintptr_t)curthread);
 3388 
 3389         case DIF_VAR_TIMESTAMP:
 3390                 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
 3391                         mstate->dtms_timestamp = dtrace_gethrtime();
 3392                         mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
 3393                 }
 3394                 return (mstate->dtms_timestamp);
 3395 
 3396         case DIF_VAR_VTIMESTAMP:
 3397                 ASSERT(dtrace_vtime_references != 0);
 3398                 return (curthread->t_dtrace_vtime);
 3399 
 3400         case DIF_VAR_WALLTIMESTAMP:
 3401                 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
 3402                         mstate->dtms_walltimestamp = dtrace_gethrestime();
 3403                         mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
 3404                 }
 3405                 return (mstate->dtms_walltimestamp);
 3406 
 3407 #ifdef illumos
 3408         case DIF_VAR_IPL:
 3409                 if (!dtrace_priv_kernel(state))
 3410                         return (0);
 3411                 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
 3412                         mstate->dtms_ipl = dtrace_getipl();
 3413                         mstate->dtms_present |= DTRACE_MSTATE_IPL;
 3414                 }
 3415                 return (mstate->dtms_ipl);
 3416 #endif
 3417 
 3418         case DIF_VAR_EPID:
 3419                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
 3420                 return (mstate->dtms_epid);
 3421 
 3422         case DIF_VAR_ID:
 3423                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
 3424                 return (mstate->dtms_probe->dtpr_id);
 3425 
 3426         case DIF_VAR_STACKDEPTH:
 3427                 if (!dtrace_priv_kernel(state))
 3428                         return (0);
 3429                 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
 3430                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
 3431 
 3432                         mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
 3433                         mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
 3434                 }
 3435                 return (mstate->dtms_stackdepth);
 3436 
 3437         case DIF_VAR_USTACKDEPTH:
 3438                 if (!dtrace_priv_proc(state))
 3439                         return (0);
 3440                 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
 3441                         /*
 3442                          * See comment in DIF_VAR_PID.
 3443                          */
 3444                         if (DTRACE_ANCHORED(mstate->dtms_probe) &&
 3445                             CPU_ON_INTR(CPU)) {
 3446                                 mstate->dtms_ustackdepth = 0;
 3447                         } else {
 3448                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 3449                                 mstate->dtms_ustackdepth =
 3450                                     dtrace_getustackdepth();
 3451                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 3452                         }
 3453                         mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
 3454                 }
 3455                 return (mstate->dtms_ustackdepth);
 3456 
 3457         case DIF_VAR_CALLER:
 3458                 if (!dtrace_priv_kernel(state))
 3459                         return (0);
 3460                 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
 3461                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
 3462 
 3463                         if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
 3464                                 /*
 3465                                  * If this is an unanchored probe, we are
 3466                                  * required to go through the slow path:
 3467                                  * dtrace_caller() only guarantees correct
 3468                                  * results for anchored probes.
 3469                                  */
 3470                                 pc_t caller[2] = {0, 0};
 3471 
 3472                                 dtrace_getpcstack(caller, 2, aframes,
 3473                                     (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
 3474                                 mstate->dtms_caller = caller[1];
 3475                         } else if ((mstate->dtms_caller =
 3476                             dtrace_caller(aframes)) == -1) {
 3477                                 /*
 3478                                  * We have failed to do this the quick way;
 3479                                  * we must resort to the slower approach of
 3480                                  * calling dtrace_getpcstack().
 3481                                  */
 3482                                 pc_t caller = 0;
 3483 
 3484                                 dtrace_getpcstack(&caller, 1, aframes, NULL);
 3485                                 mstate->dtms_caller = caller;
 3486                         }
 3487 
 3488                         mstate->dtms_present |= DTRACE_MSTATE_CALLER;
 3489                 }
 3490                 return (mstate->dtms_caller);
 3491 
 3492         case DIF_VAR_UCALLER:
 3493                 if (!dtrace_priv_proc(state))
 3494                         return (0);
 3495 
 3496                 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
 3497                         uint64_t ustack[3];
 3498 
 3499                         /*
 3500                          * dtrace_getupcstack() fills in the first uint64_t
 3501                          * with the current PID.  The second uint64_t will
 3502                          * be the program counter at user-level.  The third
 3503                          * uint64_t will contain the caller, which is what
 3504                          * we're after.
 3505                          */
 3506                         ustack[2] = 0;
 3507                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 3508                         dtrace_getupcstack(ustack, 3);
 3509                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 3510                         mstate->dtms_ucaller = ustack[2];
 3511                         mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
 3512                 }
 3513 
 3514                 return (mstate->dtms_ucaller);
 3515 
 3516         case DIF_VAR_PROBEPROV:
 3517                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
 3518                 return (dtrace_dif_varstr(
 3519                     (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
 3520                     state, mstate));
 3521 
 3522         case DIF_VAR_PROBEMOD:
 3523                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
 3524                 return (dtrace_dif_varstr(
 3525                     (uintptr_t)mstate->dtms_probe->dtpr_mod,
 3526                     state, mstate));
 3527 
 3528         case DIF_VAR_PROBEFUNC:
 3529                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
 3530                 return (dtrace_dif_varstr(
 3531                     (uintptr_t)mstate->dtms_probe->dtpr_func,
 3532                     state, mstate));
 3533 
 3534         case DIF_VAR_PROBENAME:
 3535                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
 3536                 return (dtrace_dif_varstr(
 3537                     (uintptr_t)mstate->dtms_probe->dtpr_name,
 3538                     state, mstate));
 3539 
 3540         case DIF_VAR_PID:
 3541                 if (!dtrace_priv_proc(state))
 3542                         return (0);
 3543 
 3544 #ifdef illumos
 3545                 /*
 3546                  * Note that we are assuming that an unanchored probe is
 3547                  * always due to a high-level interrupt.  (And we're assuming
 3548                  * that there is only a single high level interrupt.)
 3549                  */
 3550                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 3551                         return (pid0.pid_id);
 3552 
 3553                 /*
 3554                  * It is always safe to dereference one's own t_procp pointer:
 3555                  * it always points to a valid, allocated proc structure.
 3556                  * Further, it is always safe to dereference the p_pidp member
 3557                  * of one's own proc structure.  (These are truisms because
 3558                  * threads and processes don't clean up their own state --
 3559                  * they leave that task to whomever reaps them.)
 3560                  */
 3561                 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
 3562 #else
 3563                 return ((uint64_t)curproc->p_pid);
 3564 #endif
 3565 
 3566         case DIF_VAR_PPID:
 3567                 if (!dtrace_priv_proc(state))
 3568                         return (0);
 3569 
 3570 #ifdef illumos
 3571                 /*
 3572                  * See comment in DIF_VAR_PID.
 3573                  */
 3574                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 3575                         return (pid0.pid_id);
 3576 
 3577                 /*
 3578                  * It is always safe to dereference one's own t_procp pointer:
 3579                  * it always points to a valid, allocated proc structure.
 3580                  * (This is true because threads don't clean up their own
 3581                  * state -- they leave that task to whomever reaps them.)
 3582                  */
 3583                 return ((uint64_t)curthread->t_procp->p_ppid);
 3584 #else
 3585                 if (curproc->p_pid == proc0.p_pid)
 3586                         return (curproc->p_pid);
 3587                 else
 3588                         return (curproc->p_pptr->p_pid);
 3589 #endif
 3590 
 3591         case DIF_VAR_TID:
 3592 #ifdef illumos
 3593                 /*
 3594                  * See comment in DIF_VAR_PID.
 3595                  */
 3596                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 3597                         return (0);
 3598 #endif
 3599 
 3600                 return ((uint64_t)curthread->t_tid);
 3601 
 3602         case DIF_VAR_EXECARGS: {
 3603                 struct pargs *p_args = curthread->td_proc->p_args;
 3604 
 3605                 if (p_args == NULL)
 3606                         return(0);
 3607 
 3608                 return (dtrace_dif_varstrz(
 3609                     (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
 3610         }
 3611 
 3612         case DIF_VAR_EXECNAME:
 3613 #ifdef illumos
 3614                 if (!dtrace_priv_proc(state))
 3615                         return (0);
 3616 
 3617                 /*
 3618                  * See comment in DIF_VAR_PID.
 3619                  */
 3620                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 3621                         return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
 3622 
 3623                 /*
 3624                  * It is always safe to dereference one's own t_procp pointer:
 3625                  * it always points to a valid, allocated proc structure.
 3626                  * (This is true because threads don't clean up their own
 3627                  * state -- they leave that task to whomever reaps them.)
 3628                  */
 3629                 return (dtrace_dif_varstr(
 3630                     (uintptr_t)curthread->t_procp->p_user.u_comm,
 3631                     state, mstate));
 3632 #else
 3633                 return (dtrace_dif_varstr(
 3634                     (uintptr_t) curthread->td_proc->p_comm, state, mstate));
 3635 #endif
 3636 
 3637         case DIF_VAR_ZONENAME:
 3638 #ifdef illumos
 3639                 if (!dtrace_priv_proc(state))
 3640                         return (0);
 3641 
 3642                 /*
 3643                  * See comment in DIF_VAR_PID.
 3644                  */
 3645                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 3646                         return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
 3647 
 3648                 /*
 3649                  * It is always safe to dereference one's own t_procp pointer:
 3650                  * it always points to a valid, allocated proc structure.
 3651                  * (This is true because threads don't clean up their own
 3652                  * state -- they leave that task to whomever reaps them.)
 3653                  */
 3654                 return (dtrace_dif_varstr(
 3655                     (uintptr_t)curthread->t_procp->p_zone->zone_name,
 3656                     state, mstate));
 3657 #elif defined(__FreeBSD__)
 3658         /*
 3659          * On FreeBSD, we introduce compatibility to zonename by falling through
 3660          * into jailname.
 3661          */
 3662         case DIF_VAR_JAILNAME:
 3663                 if (!dtrace_priv_kernel(state))
 3664                         return (0);
 3665 
 3666                 return (dtrace_dif_varstr(
 3667                     (uintptr_t)curthread->td_ucred->cr_prison->pr_name,
 3668                     state, mstate));
 3669 
 3670         case DIF_VAR_JID:
 3671                 if (!dtrace_priv_kernel(state))
 3672                         return (0);
 3673 
 3674                 return ((uint64_t)curthread->td_ucred->cr_prison->pr_id);
 3675 #else
 3676                 return (0);
 3677 #endif
 3678 
 3679         case DIF_VAR_UID:
 3680                 if (!dtrace_priv_proc(state))
 3681                         return (0);
 3682 
 3683 #ifdef illumos
 3684                 /*
 3685                  * See comment in DIF_VAR_PID.
 3686                  */
 3687                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 3688                         return ((uint64_t)p0.p_cred->cr_uid);
 3689 
 3690                 /*
 3691                  * It is always safe to dereference one's own t_procp pointer:
 3692                  * it always points to a valid, allocated proc structure.
 3693                  * (This is true because threads don't clean up their own
 3694                  * state -- they leave that task to whomever reaps them.)
 3695                  *
 3696                  * Additionally, it is safe to dereference one's own process
 3697                  * credential, since this is never NULL after process birth.
 3698                  */
 3699                 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
 3700 #else
 3701                 return ((uint64_t)curthread->td_ucred->cr_uid);
 3702 #endif
 3703 
 3704         case DIF_VAR_GID:
 3705                 if (!dtrace_priv_proc(state))
 3706                         return (0);
 3707 
 3708 #ifdef illumos
 3709                 /*
 3710                  * See comment in DIF_VAR_PID.
 3711                  */
 3712                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 3713                         return ((uint64_t)p0.p_cred->cr_gid);
 3714 
 3715                 /*
 3716                  * It is always safe to dereference one's own t_procp pointer:
 3717                  * it always points to a valid, allocated proc structure.
 3718                  * (This is true because threads don't clean up their own
 3719                  * state -- they leave that task to whomever reaps them.)
 3720                  *
 3721                  * Additionally, it is safe to dereference one's own process
 3722                  * credential, since this is never NULL after process birth.
 3723                  */
 3724                 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
 3725 #else
 3726                 return ((uint64_t)curthread->td_ucred->cr_gid);
 3727 #endif
 3728 
 3729         case DIF_VAR_ERRNO: {
 3730 #ifdef illumos
 3731                 klwp_t *lwp;
 3732                 if (!dtrace_priv_proc(state))
 3733                         return (0);
 3734 
 3735                 /*
 3736                  * See comment in DIF_VAR_PID.
 3737                  */
 3738                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
 3739                         return (0);
 3740 
 3741                 /*
 3742                  * It is always safe to dereference one's own t_lwp pointer in
 3743                  * the event that this pointer is non-NULL.  (This is true
 3744                  * because threads and lwps don't clean up their own state --
 3745                  * they leave that task to whomever reaps them.)
 3746                  */
 3747                 if ((lwp = curthread->t_lwp) == NULL)
 3748                         return (0);
 3749 
 3750                 return ((uint64_t)lwp->lwp_errno);
 3751 #else
 3752                 return (curthread->td_errno);
 3753 #endif
 3754         }
 3755 #ifndef illumos
 3756         case DIF_VAR_CPU: {
 3757                 return curcpu;
 3758         }
 3759 #endif
 3760         default:
 3761                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
 3762                 return (0);
 3763         }
 3764 }
 3765 
 3766 
 3767 typedef enum dtrace_json_state {
 3768         DTRACE_JSON_REST = 1,
 3769         DTRACE_JSON_OBJECT,
 3770         DTRACE_JSON_STRING,
 3771         DTRACE_JSON_STRING_ESCAPE,
 3772         DTRACE_JSON_STRING_ESCAPE_UNICODE,
 3773         DTRACE_JSON_COLON,
 3774         DTRACE_JSON_COMMA,
 3775         DTRACE_JSON_VALUE,
 3776         DTRACE_JSON_IDENTIFIER,
 3777         DTRACE_JSON_NUMBER,
 3778         DTRACE_JSON_NUMBER_FRAC,
 3779         DTRACE_JSON_NUMBER_EXP,
 3780         DTRACE_JSON_COLLECT_OBJECT
 3781 } dtrace_json_state_t;
 3782 
 3783 /*
 3784  * This function possesses just enough knowledge about JSON to extract a single
 3785  * value from a JSON string and store it in the scratch buffer.  It is able
 3786  * to extract nested object values, and members of arrays by index.
 3787  *
 3788  * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
 3789  * be looked up as we descend into the object tree.  e.g.
 3790  *
 3791  *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
 3792  *       with nelems = 5.
 3793  *
 3794  * The run time of this function must be bounded above by strsize to limit the
 3795  * amount of work done in probe context.  As such, it is implemented as a
 3796  * simple state machine, reading one character at a time using safe loads
 3797  * until we find the requested element, hit a parsing error or run off the
 3798  * end of the object or string.
 3799  *
 3800  * As there is no way for a subroutine to return an error without interrupting
 3801  * clause execution, we simply return NULL in the event of a missing key or any
 3802  * other error condition.  Each NULL return in this function is commented with
 3803  * the error condition it represents -- parsing or otherwise.
 3804  *
 3805  * The set of states for the state machine closely matches the JSON
 3806  * specification (http://json.org/).  Briefly:
 3807  *
 3808  *   DTRACE_JSON_REST:
 3809  *     Skip whitespace until we find either a top-level Object, moving
 3810  *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
 3811  *
 3812  *   DTRACE_JSON_OBJECT:
 3813  *     Locate the next key String in an Object.  Sets a flag to denote
 3814  *     the next String as a key string and moves to DTRACE_JSON_STRING.
 3815  *
 3816  *   DTRACE_JSON_COLON:
 3817  *     Skip whitespace until we find the colon that separates key Strings
 3818  *     from their values.  Once found, move to DTRACE_JSON_VALUE.
 3819  *
 3820  *   DTRACE_JSON_VALUE:
 3821  *     Detects the type of the next value (String, Number, Identifier, Object
 3822  *     or Array) and routes to the states that process that type.  Here we also
 3823  *     deal with the element selector list if we are requested to traverse down
 3824  *     into the object tree.
 3825  *
 3826  *   DTRACE_JSON_COMMA:
 3827  *     Skip whitespace until we find the comma that separates key-value pairs
 3828  *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
 3829  *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
 3830  *     states return to this state at the end of their value, unless otherwise
 3831  *     noted.
 3832  *
 3833  *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
 3834  *     Processes a Number literal from the JSON, including any exponent
 3835  *     component that may be present.  Numbers are returned as strings, which
 3836  *     may be passed to strtoll() if an integer is required.
 3837  *
 3838  *   DTRACE_JSON_IDENTIFIER:
 3839  *     Processes a "true", "false" or "null" literal in the JSON.
 3840  *
 3841  *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
 3842  *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
 3843  *     Processes a String literal from the JSON, whether the String denotes
 3844  *     a key, a value or part of a larger Object.  Handles all escape sequences
 3845  *     present in the specification, including four-digit unicode characters,
 3846  *     but merely includes the escape sequence without converting it to the
 3847  *     actual escaped character.  If the String is flagged as a key, we
 3848  *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
 3849  *
 3850  *   DTRACE_JSON_COLLECT_OBJECT:
 3851  *     This state collects an entire Object (or Array), correctly handling
 3852  *     embedded strings.  If the full element selector list matches this nested
 3853  *     object, we return the Object in full as a string.  If not, we use this
 3854  *     state to skip to the next value at this level and continue processing.
 3855  *
 3856  * NOTE: This function uses various macros from strtolctype.h to manipulate
 3857  * digit values, etc -- these have all been checked to ensure they make
 3858  * no additional function calls.
 3859  */
 3860 static char *
 3861 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
 3862     char *dest)
 3863 {
 3864         dtrace_json_state_t state = DTRACE_JSON_REST;
 3865         int64_t array_elem = INT64_MIN;
 3866         int64_t array_pos = 0;
 3867         uint8_t escape_unicount = 0;
 3868         boolean_t string_is_key = B_FALSE;
 3869         boolean_t collect_object = B_FALSE;
 3870         boolean_t found_key = B_FALSE;
 3871         boolean_t in_array = B_FALSE;
 3872         uint32_t braces = 0, brackets = 0;
 3873         char *elem = elemlist;
 3874         char *dd = dest;
 3875         uintptr_t cur;
 3876 
 3877         for (cur = json; cur < json + size; cur++) {
 3878                 char cc = dtrace_load8(cur);
 3879                 if (cc == '\0')
 3880                         return (NULL);
 3881 
 3882                 switch (state) {
 3883                 case DTRACE_JSON_REST:
 3884                         if (isspace(cc))
 3885                                 break;
 3886 
 3887                         if (cc == '{') {
 3888                                 state = DTRACE_JSON_OBJECT;
 3889                                 break;
 3890                         }
 3891 
 3892                         if (cc == '[') {
 3893                                 in_array = B_TRUE;
 3894                                 array_pos = 0;
 3895                                 array_elem = dtrace_strtoll(elem, 10, size);
 3896                                 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
 3897                                 state = DTRACE_JSON_VALUE;
 3898                                 break;
 3899                         }
 3900 
 3901                         /*
 3902                          * ERROR: expected to find a top-level object or array.
 3903                          */
 3904                         return (NULL);
 3905                 case DTRACE_JSON_OBJECT:
 3906                         if (isspace(cc))
 3907                                 break;
 3908 
 3909                         if (cc == '"') {
 3910                                 state = DTRACE_JSON_STRING;
 3911                                 string_is_key = B_TRUE;
 3912                                 break;
 3913                         }
 3914 
 3915                         /*
 3916                          * ERROR: either the object did not start with a key
 3917                          * string, or we've run off the end of the object
 3918                          * without finding the requested key.
 3919                          */
 3920                         return (NULL);
 3921                 case DTRACE_JSON_STRING:
 3922                         if (cc == '\\') {
 3923                                 *dd++ = '\\';
 3924                                 state = DTRACE_JSON_STRING_ESCAPE;
 3925                                 break;
 3926                         }
 3927 
 3928                         if (cc == '"') {
 3929                                 if (collect_object) {
 3930                                         /*
 3931                                          * We don't reset the dest here, as
 3932                                          * the string is part of a larger
 3933                                          * object being collected.
 3934                                          */
 3935                                         *dd++ = cc;
 3936                                         collect_object = B_FALSE;
 3937                                         state = DTRACE_JSON_COLLECT_OBJECT;
 3938                                         break;
 3939                                 }
 3940                                 *dd = '\0';
 3941                                 dd = dest; /* reset string buffer */
 3942                                 if (string_is_key) {
 3943                                         if (dtrace_strncmp(dest, elem,
 3944                                             size) == 0)
 3945                                                 found_key = B_TRUE;
 3946                                 } else if (found_key) {
 3947                                         if (nelems > 1) {
 3948                                                 /*
 3949                                                  * We expected an object, not
 3950                                                  * this string.
 3951                                                  */
 3952                                                 return (NULL);
 3953                                         }
 3954                                         return (dest);
 3955                                 }
 3956                                 state = string_is_key ? DTRACE_JSON_COLON :
 3957                                     DTRACE_JSON_COMMA;
 3958                                 string_is_key = B_FALSE;
 3959                                 break;
 3960                         }
 3961 
 3962                         *dd++ = cc;
 3963                         break;
 3964                 case DTRACE_JSON_STRING_ESCAPE:
 3965                         *dd++ = cc;
 3966                         if (cc == 'u') {
 3967                                 escape_unicount = 0;
 3968                                 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
 3969                         } else {
 3970                                 state = DTRACE_JSON_STRING;
 3971                         }
 3972                         break;
 3973                 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
 3974                         if (!isxdigit(cc)) {
 3975                                 /*
 3976                                  * ERROR: invalid unicode escape, expected
 3977                                  * four valid hexidecimal digits.
 3978                                  */
 3979                                 return (NULL);
 3980                         }
 3981 
 3982                         *dd++ = cc;
 3983                         if (++escape_unicount == 4)
 3984                                 state = DTRACE_JSON_STRING;
 3985                         break;
 3986                 case DTRACE_JSON_COLON:
 3987                         if (isspace(cc))
 3988                                 break;
 3989 
 3990                         if (cc == ':') {
 3991                                 state = DTRACE_JSON_VALUE;
 3992                                 break;
 3993                         }
 3994 
 3995                         /*
 3996                          * ERROR: expected a colon.
 3997                          */
 3998                         return (NULL);
 3999                 case DTRACE_JSON_COMMA:
 4000                         if (isspace(cc))
 4001                                 break;
 4002 
 4003                         if (cc == ',') {
 4004                                 if (in_array) {
 4005                                         state = DTRACE_JSON_VALUE;
 4006                                         if (++array_pos == array_elem)
 4007                                                 found_key = B_TRUE;
 4008                                 } else {
 4009                                         state = DTRACE_JSON_OBJECT;
 4010                                 }
 4011                                 break;
 4012                         }
 4013 
 4014                         /*
 4015                          * ERROR: either we hit an unexpected character, or
 4016                          * we reached the end of the object or array without
 4017                          * finding the requested key.
 4018                          */
 4019                         return (NULL);
 4020                 case DTRACE_JSON_IDENTIFIER:
 4021                         if (islower(cc)) {
 4022                                 *dd++ = cc;
 4023                                 break;
 4024                         }
 4025 
 4026                         *dd = '\0';
 4027                         dd = dest; /* reset string buffer */
 4028 
 4029                         if (dtrace_strncmp(dest, "true", 5) == 0 ||
 4030                             dtrace_strncmp(dest, "false", 6) == 0 ||
 4031                             dtrace_strncmp(dest, "null", 5) == 0) {
 4032                                 if (found_key) {
 4033                                         if (nelems > 1) {
 4034                                                 /*
 4035                                                  * ERROR: We expected an object,
 4036                                                  * not this identifier.
 4037                                                  */
 4038                                                 return (NULL);
 4039                                         }
 4040                                         return (dest);
 4041                                 } else {
 4042                                         cur--;
 4043                                         state = DTRACE_JSON_COMMA;
 4044                                         break;
 4045                                 }
 4046                         }
 4047 
 4048                         /*
 4049                          * ERROR: we did not recognise the identifier as one
 4050                          * of those in the JSON specification.
 4051                          */
 4052                         return (NULL);
 4053                 case DTRACE_JSON_NUMBER:
 4054                         if (cc == '.') {
 4055                                 *dd++ = cc;
 4056                                 state = DTRACE_JSON_NUMBER_FRAC;
 4057                                 break;
 4058                         }
 4059 
 4060                         if (cc == 'x' || cc == 'X') {
 4061                                 /*
 4062                                  * ERROR: specification explicitly excludes
 4063                                  * hexidecimal or octal numbers.
 4064                                  */
 4065                                 return (NULL);
 4066                         }
 4067 
 4068                         /* FALLTHRU */
 4069                 case DTRACE_JSON_NUMBER_FRAC:
 4070                         if (cc == 'e' || cc == 'E') {
 4071                                 *dd++ = cc;
 4072                                 state = DTRACE_JSON_NUMBER_EXP;
 4073                                 break;
 4074                         }
 4075 
 4076                         if (cc == '+' || cc == '-') {
 4077                                 /*
 4078                                  * ERROR: expect sign as part of exponent only.
 4079                                  */
 4080                                 return (NULL);
 4081                         }
 4082                         /* FALLTHRU */
 4083                 case DTRACE_JSON_NUMBER_EXP:
 4084                         if (isdigit(cc) || cc == '+' || cc == '-') {
 4085                                 *dd++ = cc;
 4086                                 break;
 4087                         }
 4088 
 4089                         *dd = '\0';
 4090                         dd = dest; /* reset string buffer */
 4091                         if (found_key) {
 4092                                 if (nelems > 1) {
 4093                                         /*
 4094                                          * ERROR: We expected an object, not
 4095                                          * this number.
 4096                                          */
 4097                                         return (NULL);
 4098                                 }
 4099                                 return (dest);
 4100                         }
 4101 
 4102                         cur--;
 4103                         state = DTRACE_JSON_COMMA;
 4104                         break;
 4105                 case DTRACE_JSON_VALUE:
 4106                         if (isspace(cc))
 4107                                 break;
 4108 
 4109                         if (cc == '{' || cc == '[') {
 4110                                 if (nelems > 1 && found_key) {
 4111                                         in_array = cc == '[' ? B_TRUE : B_FALSE;
 4112                                         /*
 4113                                          * If our element selector directs us
 4114                                          * to descend into this nested object,
 4115                                          * then move to the next selector
 4116                                          * element in the list and restart the
 4117                                          * state machine.
 4118                                          */
 4119                                         while (*elem != '\0')
 4120                                                 elem++;
 4121                                         elem++; /* skip the inter-element NUL */
 4122                                         nelems--;
 4123                                         dd = dest;
 4124                                         if (in_array) {
 4125                                                 state = DTRACE_JSON_VALUE;
 4126                                                 array_pos = 0;
 4127                                                 array_elem = dtrace_strtoll(
 4128                                                     elem, 10, size);
 4129                                                 found_key = array_elem == 0 ?
 4130                                                     B_TRUE : B_FALSE;
 4131                                         } else {
 4132                                                 found_key = B_FALSE;
 4133                                                 state = DTRACE_JSON_OBJECT;
 4134                                         }
 4135                                         break;
 4136                                 }
 4137 
 4138                                 /*
 4139                                  * Otherwise, we wish to either skip this
 4140                                  * nested object or return it in full.
 4141                                  */
 4142                                 if (cc == '[')
 4143                                         brackets = 1;
 4144                                 else
 4145                                         braces = 1;
 4146                                 *dd++ = cc;
 4147                                 state = DTRACE_JSON_COLLECT_OBJECT;
 4148                                 break;
 4149                         }
 4150 
 4151                         if (cc == '"') {
 4152                                 state = DTRACE_JSON_STRING;
 4153                                 break;
 4154                         }
 4155 
 4156                         if (islower(cc)) {
 4157                                 /*
 4158                                  * Here we deal with true, false and null.
 4159                                  */
 4160                                 *dd++ = cc;
 4161                                 state = DTRACE_JSON_IDENTIFIER;
 4162                                 break;
 4163                         }
 4164 
 4165                         if (cc == '-' || isdigit(cc)) {
 4166                                 *dd++ = cc;
 4167                                 state = DTRACE_JSON_NUMBER;
 4168                                 break;
 4169                         }
 4170 
 4171                         /*
 4172                          * ERROR: unexpected character at start of value.
 4173                          */
 4174                         return (NULL);
 4175                 case DTRACE_JSON_COLLECT_OBJECT:
 4176                         if (cc == '\0')
 4177                                 /*
 4178                                  * ERROR: unexpected end of input.
 4179                                  */
 4180                                 return (NULL);
 4181 
 4182                         *dd++ = cc;
 4183                         if (cc == '"') {
 4184                                 collect_object = B_TRUE;
 4185                                 state = DTRACE_JSON_STRING;
 4186                                 break;
 4187                         }
 4188 
 4189                         if (cc == ']') {
 4190                                 if (brackets-- == 0) {
 4191                                         /*
 4192                                          * ERROR: unbalanced brackets.
 4193                                          */
 4194                                         return (NULL);
 4195                                 }
 4196                         } else if (cc == '}') {
 4197                                 if (braces-- == 0) {
 4198                                         /*
 4199                                          * ERROR: unbalanced braces.
 4200                                          */
 4201                                         return (NULL);
 4202                                 }
 4203                         } else if (cc == '{') {
 4204                                 braces++;
 4205                         } else if (cc == '[') {
 4206                                 brackets++;
 4207                         }
 4208 
 4209                         if (brackets == 0 && braces == 0) {
 4210                                 if (found_key) {
 4211                                         *dd = '\0';
 4212                                         return (dest);
 4213                                 }
 4214                                 dd = dest; /* reset string buffer */
 4215                                 state = DTRACE_JSON_COMMA;
 4216                         }
 4217                         break;
 4218                 }
 4219         }
 4220         return (NULL);
 4221 }
 4222 
 4223 /*
 4224  * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
 4225  * Notice that we don't bother validating the proper number of arguments or
 4226  * their types in the tuple stack.  This isn't needed because all argument
 4227  * interpretation is safe because of our load safety -- the worst that can
 4228  * happen is that a bogus program can obtain bogus results.
 4229  */
 4230 static void
 4231 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
 4232     dtrace_key_t *tupregs, int nargs,
 4233     dtrace_mstate_t *mstate, dtrace_state_t *state)
 4234 {
 4235         volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
 4236         volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
 4237         dtrace_vstate_t *vstate = &state->dts_vstate;
 4238 
 4239 #ifdef illumos
 4240         union {
 4241                 mutex_impl_t mi;
 4242                 uint64_t mx;
 4243         } m;
 4244 
 4245         union {
 4246                 krwlock_t ri;
 4247                 uintptr_t rw;
 4248         } r;
 4249 #else
 4250         struct thread *lowner;
 4251         union {
 4252                 struct lock_object *li;
 4253                 uintptr_t lx;
 4254         } l;
 4255 #endif
 4256 
 4257         switch (subr) {
 4258         case DIF_SUBR_RAND:
 4259                 regs[rd] = dtrace_xoroshiro128_plus_next(
 4260                     state->dts_rstate[curcpu]);
 4261                 break;
 4262 
 4263 #ifdef illumos
 4264         case DIF_SUBR_MUTEX_OWNED:
 4265                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
 4266                     mstate, vstate)) {
 4267                         regs[rd] = 0;
 4268                         break;
 4269                 }
 4270 
 4271                 m.mx = dtrace_load64(tupregs[0].dttk_value);
 4272                 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
 4273                         regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
 4274                 else
 4275                         regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
 4276                 break;
 4277 
 4278         case DIF_SUBR_MUTEX_OWNER:
 4279                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
 4280                     mstate, vstate)) {
 4281                         regs[rd] = 0;
 4282                         break;
 4283                 }
 4284 
 4285                 m.mx = dtrace_load64(tupregs[0].dttk_value);
 4286                 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
 4287                     MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
 4288                         regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
 4289                 else
 4290                         regs[rd] = 0;
 4291                 break;
 4292 
 4293         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
 4294                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
 4295                     mstate, vstate)) {
 4296                         regs[rd] = 0;
 4297                         break;
 4298                 }
 4299 
 4300                 m.mx = dtrace_load64(tupregs[0].dttk_value);
 4301                 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
 4302                 break;
 4303 
 4304         case DIF_SUBR_MUTEX_TYPE_SPIN:
 4305                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
 4306                     mstate, vstate)) {
 4307                         regs[rd] = 0;
 4308                         break;
 4309                 }
 4310 
 4311                 m.mx = dtrace_load64(tupregs[0].dttk_value);
 4312                 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
 4313                 break;
 4314 
 4315         case DIF_SUBR_RW_READ_HELD: {
 4316                 uintptr_t tmp;
 4317 
 4318                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
 4319                     mstate, vstate)) {
 4320                         regs[rd] = 0;
 4321                         break;
 4322                 }
 4323 
 4324                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
 4325                 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
 4326                 break;
 4327         }
 4328 
 4329         case DIF_SUBR_RW_WRITE_HELD:
 4330                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
 4331                     mstate, vstate)) {
 4332                         regs[rd] = 0;
 4333                         break;
 4334                 }
 4335 
 4336                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
 4337                 regs[rd] = _RW_WRITE_HELD(&r.ri);
 4338                 break;
 4339 
 4340         case DIF_SUBR_RW_ISWRITER:
 4341                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
 4342                     mstate, vstate)) {
 4343                         regs[rd] = 0;
 4344                         break;
 4345                 }
 4346 
 4347                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
 4348                 regs[rd] = _RW_ISWRITER(&r.ri);
 4349                 break;
 4350 
 4351 #else /* !illumos */
 4352         case DIF_SUBR_MUTEX_OWNED:
 4353                 if (!dtrace_canload(tupregs[0].dttk_value,
 4354                         sizeof (struct lock_object), mstate, vstate)) {
 4355                         regs[rd] = 0;
 4356                         break;
 4357                 }
 4358                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
 4359                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4360                 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
 4361                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4362                 break;
 4363 
 4364         case DIF_SUBR_MUTEX_OWNER:
 4365                 if (!dtrace_canload(tupregs[0].dttk_value,
 4366                         sizeof (struct lock_object), mstate, vstate)) {
 4367                         regs[rd] = 0;
 4368                         break;
 4369                 }
 4370                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
 4371                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4372                 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
 4373                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4374                 regs[rd] = (uintptr_t)lowner;
 4375                 break;
 4376 
 4377         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
 4378                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
 4379                     mstate, vstate)) {
 4380                         regs[rd] = 0;
 4381                         break;
 4382                 }
 4383                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
 4384                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4385                 regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SLEEPLOCK) != 0;
 4386                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4387                 break;
 4388 
 4389         case DIF_SUBR_MUTEX_TYPE_SPIN:
 4390                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
 4391                     mstate, vstate)) {
 4392                         regs[rd] = 0;
 4393                         break;
 4394                 }
 4395                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
 4396                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4397                 regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
 4398                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4399                 break;
 4400 
 4401         case DIF_SUBR_RW_READ_HELD: 
 4402         case DIF_SUBR_SX_SHARED_HELD: 
 4403                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
 4404                     mstate, vstate)) {
 4405                         regs[rd] = 0;
 4406                         break;
 4407                 }
 4408                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
 4409                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4410                 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
 4411                     lowner == NULL;
 4412                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4413                 break;
 4414 
 4415         case DIF_SUBR_RW_WRITE_HELD:
 4416         case DIF_SUBR_SX_EXCLUSIVE_HELD:
 4417                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
 4418                     mstate, vstate)) {
 4419                         regs[rd] = 0;
 4420                         break;
 4421                 }
 4422                 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
 4423                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4424                 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
 4425                     lowner != NULL;
 4426                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4427                 break;
 4428 
 4429         case DIF_SUBR_RW_ISWRITER:
 4430         case DIF_SUBR_SX_ISEXCLUSIVE:
 4431                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
 4432                     mstate, vstate)) {
 4433                         regs[rd] = 0;
 4434                         break;
 4435                 }
 4436                 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
 4437                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4438                 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
 4439                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4440                 regs[rd] = (lowner == curthread);
 4441                 break;
 4442 #endif /* illumos */
 4443 
 4444         case DIF_SUBR_BCOPY: {
 4445                 /*
 4446                  * We need to be sure that the destination is in the scratch
 4447                  * region -- no other region is allowed.
 4448                  */
 4449                 uintptr_t src = tupregs[0].dttk_value;
 4450                 uintptr_t dest = tupregs[1].dttk_value;
 4451                 size_t size = tupregs[2].dttk_value;
 4452 
 4453                 if (!dtrace_inscratch(dest, size, mstate)) {
 4454                         *flags |= CPU_DTRACE_BADADDR;
 4455                         *illval = regs[rd];
 4456                         break;
 4457                 }
 4458 
 4459                 if (!dtrace_canload(src, size, mstate, vstate)) {
 4460                         regs[rd] = 0;
 4461                         break;
 4462                 }
 4463 
 4464                 dtrace_bcopy((void *)src, (void *)dest, size);
 4465                 break;
 4466         }
 4467 
 4468         case DIF_SUBR_ALLOCA:
 4469         case DIF_SUBR_COPYIN: {
 4470                 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
 4471                 uint64_t size =
 4472                     tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
 4473                 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
 4474 
 4475                 /*
 4476                  * This action doesn't require any credential checks since
 4477                  * probes will not activate in user contexts to which the
 4478                  * enabling user does not have permissions.
 4479                  */
 4480 
 4481                 /*
 4482                  * Rounding up the user allocation size could have overflowed
 4483                  * a large, bogus allocation (like -1ULL) to 0.
 4484                  */
 4485                 if (scratch_size < size ||
 4486                     !DTRACE_INSCRATCH(mstate, scratch_size)) {
 4487                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 4488                         regs[rd] = 0;
 4489                         break;
 4490                 }
 4491 
 4492                 if (subr == DIF_SUBR_COPYIN) {
 4493                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4494                         dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
 4495                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4496                 }
 4497 
 4498                 mstate->dtms_scratch_ptr += scratch_size;
 4499                 regs[rd] = dest;
 4500                 break;
 4501         }
 4502 
 4503         case DIF_SUBR_COPYINTO: {
 4504                 uint64_t size = tupregs[1].dttk_value;
 4505                 uintptr_t dest = tupregs[2].dttk_value;
 4506 
 4507                 /*
 4508                  * This action doesn't require any credential checks since
 4509                  * probes will not activate in user contexts to which the
 4510                  * enabling user does not have permissions.
 4511                  */
 4512                 if (!dtrace_inscratch(dest, size, mstate)) {
 4513                         *flags |= CPU_DTRACE_BADADDR;
 4514                         *illval = regs[rd];
 4515                         break;
 4516                 }
 4517 
 4518                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4519                 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
 4520                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4521                 break;
 4522         }
 4523 
 4524         case DIF_SUBR_COPYINSTR: {
 4525                 uintptr_t dest = mstate->dtms_scratch_ptr;
 4526                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 4527 
 4528                 if (nargs > 1 && tupregs[1].dttk_value < size)
 4529                         size = tupregs[1].dttk_value + 1;
 4530 
 4531                 /*
 4532                  * This action doesn't require any credential checks since
 4533                  * probes will not activate in user contexts to which the
 4534                  * enabling user does not have permissions.
 4535                  */
 4536                 if (!DTRACE_INSCRATCH(mstate, size)) {
 4537                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 4538                         regs[rd] = 0;
 4539                         break;
 4540                 }
 4541 
 4542                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4543                 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
 4544                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4545 
 4546                 ((char *)dest)[size - 1] = '\0';
 4547                 mstate->dtms_scratch_ptr += size;
 4548                 regs[rd] = dest;
 4549                 break;
 4550         }
 4551 
 4552 #ifdef illumos
        case DIF_SUBR_MSGSIZE:
        case DIF_SUBR_MSGDSIZE: {
                /*
                 * Walk the chain of mblks that starts at the first argument
                 * and is linked through b_cont, summing (b_wptr - b_rptr) of
                 * each block.  For DIF_SUBR_MSGDSIZE only blocks whose dblk
                 * has db_type == M_DATA contribute to the sum.
                 */
                uintptr_t baddr = tupregs[0].dttk_value, daddr;
                uintptr_t wptr, rptr;
                size_t count = 0;
                int cont = 0;

                while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {

                        if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
                            vstate)) {
                                /*
                                 * NOTE(review): regs[rd] is stored again
                                 * after the loop unless a CPU_DTRACE_FAULT
                                 * bit is set -- confirm that a failed
                                 * dtrace_canload() sets one.
                                 */
                                regs[rd] = 0;
                                break;
                        }

                        wptr = dtrace_loadptr(baddr +
                            offsetof(mblk_t, b_wptr));

                        rptr = dtrace_loadptr(baddr +
                            offsetof(mblk_t, b_rptr));

                        /* A write pointer behind the read pointer is bogus. */
                        if (wptr < rptr) {
                                *flags |= CPU_DTRACE_BADADDR;
                                *illval = tupregs[0].dttk_value;
                                break;
                        }

                        daddr = dtrace_loadptr(baddr +
                            offsetof(mblk_t, b_datap));

                        /*
                         * Advance to the next block now; the "continue"
                         * below relies on baddr already pointing at it.
                         */
                        baddr = dtrace_loadptr(baddr +
                            offsetof(mblk_t, b_cont));

                        /*
                         * We want to prevent against denial-of-service here,
                         * so we're only going to search the list for
                         * dtrace_msgdsize_max mblks.
                         */
                        if (cont++ > dtrace_msgdsize_max) {
                                *flags |= CPU_DTRACE_ILLOP;
                                break;
                        }

                        if (subr == DIF_SUBR_MSGDSIZE) {
                                if (dtrace_load8(daddr +
                                    offsetof(dblk_t, db_type)) != M_DATA)
                                        continue;
                        }

                        count += wptr - rptr;
                }

                /* Publish the total only if no fault flag was raised. */
                if (!(*flags & CPU_DTRACE_FAULT))
                        regs[rd] = count;

                break;
        }
 4610 #endif
 4611 
 4612         case DIF_SUBR_PROGENYOF: {
 4613                 pid_t pid = tupregs[0].dttk_value;
 4614                 proc_t *p;
 4615                 int rval = 0;
 4616 
 4617                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4618 
 4619                 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
 4620 #ifdef illumos
 4621                         if (p->p_pidp->pid_id == pid) {
 4622 #else
 4623                         if (p->p_pid == pid) {
 4624 #endif
 4625                                 rval = 1;
 4626                                 break;
 4627                         }
 4628                 }
 4629 
 4630                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4631 
 4632                 regs[rd] = rval;
 4633                 break;
 4634         }
 4635 
        case DIF_SUBR_SPECULATION:
                /* The result is whatever dtrace_speculation() returns. */
                regs[rd] = dtrace_speculation(state);
                break;
 4639 
 4640         case DIF_SUBR_COPYOUT: {
 4641                 uintptr_t kaddr = tupregs[0].dttk_value;
 4642                 uintptr_t uaddr = tupregs[1].dttk_value;
 4643                 uint64_t size = tupregs[2].dttk_value;
 4644 
 4645                 if (!dtrace_destructive_disallow &&
 4646                     dtrace_priv_proc_control(state) &&
 4647                     !dtrace_istoxic(kaddr, size) &&
 4648                     dtrace_canload(kaddr, size, mstate, vstate)) {
 4649                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4650                         dtrace_copyout(kaddr, uaddr, size, flags);
 4651                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4652                 }
 4653                 break;
 4654         }
 4655 
 4656         case DIF_SUBR_COPYOUTSTR: {
 4657                 uintptr_t kaddr = tupregs[0].dttk_value;
 4658                 uintptr_t uaddr = tupregs[1].dttk_value;
 4659                 uint64_t size = tupregs[2].dttk_value;
 4660                 size_t lim;
 4661 
 4662                 if (!dtrace_destructive_disallow &&
 4663                     dtrace_priv_proc_control(state) &&
 4664                     !dtrace_istoxic(kaddr, size) &&
 4665                     dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
 4666                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 4667                         dtrace_copyoutstr(kaddr, uaddr, lim, flags);
 4668                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 4669                 }
 4670                 break;
 4671         }
 4672 
 4673         case DIF_SUBR_STRLEN: {
 4674                 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
 4675                 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
 4676                 size_t lim;
 4677 
 4678                 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
 4679                         regs[rd] = 0;
 4680                         break;
 4681                 }
 4682 
 4683                 regs[rd] = dtrace_strlen((char *)addr, lim);
 4684                 break;
 4685         }
 4686 
 4687         case DIF_SUBR_STRCHR:
 4688         case DIF_SUBR_STRRCHR: {
 4689                 /*
 4690                  * We're going to iterate over the string looking for the
 4691                  * specified character.  We will iterate until we have reached
 4692                  * the string length or we have found the character.  If this
 4693                  * is DIF_SUBR_STRRCHR, we will look for the last occurrence
 4694                  * of the specified character instead of the first.
 4695                  */
 4696                 uintptr_t addr = tupregs[0].dttk_value;
 4697                 uintptr_t addr_limit;
 4698                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 4699                 size_t lim;
 4700                 char c, target = (char)tupregs[1].dttk_value;
 4701 
 4702                 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
 4703                         regs[rd] = 0;
 4704                         break;
 4705                 }
 4706                 addr_limit = addr + lim;
 4707 
 4708                 for (regs[rd] = 0; addr < addr_limit; addr++) {
 4709                         if ((c = dtrace_load8(addr)) == target) {
 4710                                 regs[rd] = addr;
 4711 
 4712                                 if (subr == DIF_SUBR_STRCHR)
 4713                                         break;
 4714                         }
 4715 
 4716                         if (c == '\0')
 4717                                 break;
 4718                 }
 4719                 break;
 4720         }
 4721 
        case DIF_SUBR_STRSTR:
        case DIF_SUBR_INDEX:
        case DIF_SUBR_RINDEX: {
                /*
                 * We're going to iterate over the string looking for the
                 * specified string.  We will iterate until we have reached
                 * the string length or we have found the string.  (Yes, this
                 * is done in the most naive way possible -- but considering
                 * that the string we're searching for is likely to be
                 * relatively short, the complexity of Rabin-Karp or similar
                 * hardly seems merited.)
                 *
                 * strstr() yields the address of the match (0 if none);
                 * index() and rindex() yield the zero-based offset of the
                 * match (-1 if none).  rindex() scans backward.
                 */
                char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
                char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
                uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
                size_t len = dtrace_strlen(addr, size);
                size_t sublen = dtrace_strlen(substr, size);
                char *limit = addr + len, *orig = addr;
                int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
                int inc = 1;

                regs[rd] = notfound;

                /* Both strings, including their terminators, must be loadable. */
                if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
                        regs[rd] = 0;
                        break;
                }

                if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
                    vstate)) {
                        regs[rd] = 0;
                        break;
                }

                /*
                 * strstr() and index()/rindex() have similar semantics if
                 * both strings are the empty string: strstr() returns a
                 * pointer to the (empty) string, and index() and rindex()
                 * both return index 0 (regardless of any position argument).
                 */
                if (sublen == 0 && len == 0) {
                        if (subr == DIF_SUBR_STRSTR)
                                regs[rd] = (uintptr_t)addr;
                        else
                                regs[rd] = 0;
                        break;
                }

                if (subr != DIF_SUBR_STRSTR) {
                        if (subr == DIF_SUBR_RINDEX) {
                                /*
                                 * Scan backward: start at the terminator and
                                 * stop one position before the string.
                                 */
                                limit = orig - 1;
                                addr += len;
                                inc = -1;
                        }

                        /*
                         * Both index() and rindex() take an optional position
                         * argument that denotes the starting position.
                         */
                        if (nargs == 3) {
                                int64_t pos = (int64_t)tupregs[2].dttk_value;

                                /*
                                 * If the position argument to index() is
                                 * negative, Perl implicitly clamps it at
                                 * zero.  This semantic is a little surprising
                                 * given the special meaning of negative
                                 * positions to similar Perl functions like
                                 * substr(), but it appears to reflect a
                                 * notion that index() can start from a
                                 * negative index and increment its way up to
                                 * the string.  Given this notion, Perl's
                                 * rindex() is at least self-consistent in
                                 * that it implicitly clamps positions greater
                                 * than the string length to be the string
                                 * length.  Where Perl completely loses
                                 * coherence, however, is when the specified
                                 * substring is the empty string ("").  In
                                 * this case, even if the position is
                                 * negative, rindex() returns 0 -- and even if
                                 * the position is greater than the length,
                                 * index() returns the string length.  These
                                 * semantics violate the notion that index()
                                 * should never return a value less than the
                                 * specified position and that rindex() should
                                 * never return a value greater than the
                                 * specified position.  (One assumes that
                                 * these semantics are artifacts of Perl's
                                 * implementation and not the results of
                                 * deliberate design -- it beggars belief that
                                 * even Larry Wall could desire such oddness.)
                                 * While in the abstract one would wish for
                                 * consistent position semantics across
                                 * substr(), index() and rindex() -- or at the
                                 * very least self-consistent position
                                 * semantics for index() and rindex() -- we
                                 * instead opt to keep with the extant Perl
                                 * semantics, in all their broken glory.  (Do
                                 * we have more desire to maintain Perl's
                                 * semantics than Perl does?  Probably.)
                                 */
                                if (subr == DIF_SUBR_RINDEX) {
                                        if (pos < 0) {
                                                if (sublen == 0)
                                                        regs[rd] = 0;
                                                break;
                                        }

                                        if (pos > len)
                                                pos = len;
                                } else {
                                        if (pos < 0)
                                                pos = 0;

                                        if (pos >= len) {
                                                if (sublen == 0)
                                                        regs[rd] = len;
                                                break;
                                        }
                                }

                                addr = orig + pos;
                        }
                }

                /*
                 * Step through the string in the chosen direction, comparing
                 * sublen bytes at each position, until a match is found or
                 * the limit is reached.
                 */
                for (regs[rd] = notfound; addr != limit; addr += inc) {
                        if (dtrace_strncmp(addr, substr, sublen) == 0) {
                                if (subr != DIF_SUBR_STRSTR) {
                                        /*
                                         * As D index() and rindex() are
                                         * modeled on Perl (and not on awk),
                                         * we return a zero-based (and not a
                                         * one-based) index.  (For you Perl
                                         * weenies: no, we're not going to add
                                         * $[ -- and shouldn't you be at a con
                                         * or something?)
                                         */
                                        regs[rd] = (uintptr_t)(addr - orig);
                                        break;
                                }

                                ASSERT(subr == DIF_SUBR_STRSTR);
                                regs[rd] = (uintptr_t)addr;
                                break;
                        }
                }

                break;
        }
 4871 
        case DIF_SUBR_STRTOK: {
                /*
                 * strtok(str, delimiters): copy the next token of str into a
                 * fresh scratch buffer and return its address, or return 0
                 * when no token is left.  A str of 0 resumes from the
                 * position saved in the mstate by the previous call.
                 */
                uintptr_t addr = tupregs[0].dttk_value;
                uintptr_t tokaddr = tupregs[1].dttk_value;
                uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
                uintptr_t limit, toklimit;
                size_t clim;
                uint8_t c = 0, tokmap[32];       /* 256 / 8 */
                char *dest = (char *)mstate->dtms_scratch_ptr;
                int i;

                /*
                 * Check both the token buffer and (later) the input buffer,
                 * since both could be non-scratch addresses.
                 */
                if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
                        regs[rd] = 0;
                        break;
                }
                toklimit = tokaddr + clim;

                /* The result buffer occupies size bytes of scratch space. */
                if (!DTRACE_INSCRATCH(mstate, size)) {
                        DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
                        regs[rd] = 0;
                        break;
                }

                if (addr == 0) {
                        /*
                         * If the address specified is NULL, we use our saved
                         * strtok pointer from the mstate.  Note that this
                         * means that the saved strtok pointer is _only_
                         * valid within multiple enablings of the same probe --
                         * it behaves like an implicit clause-local variable.
                         */
                        addr = mstate->dtms_strtok;
                        limit = mstate->dtms_strtok_limit;
                } else {
                        /*
                         * If the user-specified address is non-NULL we must
                         * access check it.  This is the only time we have
                         * a chance to do so, since this address may reside
                         * in the string table of this clause-- future calls
                         * (when we fetch addr from mstate->dtms_strtok)
                         * would fail this access check.
                         */
                        if (!dtrace_strcanload(addr, size, &clim, mstate,
                            vstate)) {
                                regs[rd] = 0;
                                break;
                        }
                        limit = addr + clim;
                }

                /*
                 * First, zero the token map, and then process the token
                 * string -- setting a bit in the map for every character
                 * found in the token string.
                 */
                for (i = 0; i < sizeof (tokmap); i++)
                        tokmap[i] = 0;

                for (; tokaddr < toklimit; tokaddr++) {
                        if ((c = dtrace_load8(tokaddr)) == '\0')
                                break;

                        /* Byte c >> 3 of the map, bit c & 7 within it. */
                        ASSERT((c >> 3) < sizeof (tokmap));
                        tokmap[c >> 3] |= (1 << (c & 0x7));
                }

                /* Skip over any leading delimiter characters. */
                for (; addr < limit; addr++) {
                        /*
                         * We're looking for a character that is _not_
                         * contained in the token string.
                         */
                        if ((c = dtrace_load8(addr)) == '\0')
                                break;

                        if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
                                break;
                }

                if (c == '\0') {
                        /*
                         * We reached the end of the string without finding
                         * any character that was not in the token string.
                         * We return NULL in this case, and we set the saved
                         * address to NULL as well.
                         */
                        regs[rd] = 0;
                        mstate->dtms_strtok = 0;
                        mstate->dtms_strtok_limit = 0;
                        break;
                }

                /*
                 * From here on, we're copying into the destination string.
                 * The copy stops at a NUL, at the next delimiter, at the
                 * load limit, or once size - 1 bytes have been written.
                 */
                for (i = 0; addr < limit && i < size - 1; addr++) {
                        if ((c = dtrace_load8(addr)) == '\0')
                                break;

                        if (tokmap[c >> 3] & (1 << (c & 0x7)))
                                break;

                        ASSERT(i < size);
                        dest[i++] = c;
                }

                ASSERT(i < size);
                dest[i] = '\0';
                regs[rd] = (uintptr_t)dest;
                mstate->dtms_scratch_ptr += size;
                /* Remember where to resume for a later call with str == 0. */
                mstate->dtms_strtok = addr;
                mstate->dtms_strtok_limit = limit;
                break;
        }
 4988 
 4989         case DIF_SUBR_SUBSTR: {
 4990                 uintptr_t s = tupregs[0].dttk_value;
 4991                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 4992                 char *d = (char *)mstate->dtms_scratch_ptr;
 4993                 int64_t index = (int64_t)tupregs[1].dttk_value;
 4994                 int64_t remaining = (int64_t)tupregs[2].dttk_value;
 4995                 size_t len = dtrace_strlen((char *)s, size);
 4996                 int64_t i;
 4997 
 4998                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
 4999                         regs[rd] = 0;
 5000                         break;
 5001                 }
 5002 
 5003                 if (!DTRACE_INSCRATCH(mstate, size)) {
 5004                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5005                         regs[rd] = 0;
 5006                         break;
 5007                 }
 5008 
 5009                 if (nargs <= 2)
 5010                         remaining = (int64_t)size;
 5011 
 5012                 if (index < 0) {
 5013                         index += len;
 5014 
 5015                         if (index < 0 && index + remaining > 0) {
 5016                                 remaining += index;
 5017                                 index = 0;
 5018                         }
 5019                 }
 5020 
 5021                 if (index >= len || index < 0) {
 5022                         remaining = 0;
 5023                 } else if (remaining < 0) {
 5024                         remaining += len - index;
 5025                 } else if (index + remaining > size) {
 5026                         remaining = size - index;
 5027                 }
 5028 
 5029                 for (i = 0; i < remaining; i++) {
 5030                         if ((d[i] = dtrace_load8(s + index + i)) == '\0')
 5031                                 break;
 5032                 }
 5033 
 5034                 d[i] = '\0';
 5035 
 5036                 mstate->dtms_scratch_ptr += size;
 5037                 regs[rd] = (uintptr_t)d;
 5038                 break;
 5039         }
 5040 
 5041         case DIF_SUBR_JSON: {
 5042                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 5043                 uintptr_t json = tupregs[0].dttk_value;
 5044                 size_t jsonlen = dtrace_strlen((char *)json, size);
 5045                 uintptr_t elem = tupregs[1].dttk_value;
 5046                 size_t elemlen = dtrace_strlen((char *)elem, size);
 5047 
 5048                 char *dest = (char *)mstate->dtms_scratch_ptr;
 5049                 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
 5050                 char *ee = elemlist;
 5051                 int nelems = 1;
 5052                 uintptr_t cur;
 5053 
 5054                 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
 5055                     !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
 5056                         regs[rd] = 0;
 5057                         break;
 5058                 }
 5059 
 5060                 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
 5061                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5062                         regs[rd] = 0;
 5063                         break;
 5064                 }
 5065 
 5066                 /*
 5067                  * Read the element selector and split it up into a packed list
 5068                  * of strings.
 5069                  */
 5070                 for (cur = elem; cur < elem + elemlen; cur++) {
 5071                         char cc = dtrace_load8(cur);
 5072 
 5073                         if (cur == elem && cc == '[') {
 5074                                 /*
 5075                                  * If the first element selector key is
 5076                                  * actually an array index then ignore the
 5077                                  * bracket.
 5078                                  */
 5079                                 continue;
 5080                         }
 5081 
 5082                         if (cc == ']')
 5083                                 continue;
 5084 
 5085                         if (cc == '.' || cc == '[') {
 5086                                 nelems++;
 5087                                 cc = '\0';
 5088                         }
 5089 
 5090                         *ee++ = cc;
 5091                 }
 5092                 *ee++ = '\0';
 5093 
 5094                 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
 5095                     nelems, dest)) != 0)
 5096                         mstate->dtms_scratch_ptr += jsonlen + 1;
 5097                 break;
 5098         }
 5099 
 5100         case DIF_SUBR_TOUPPER:
 5101         case DIF_SUBR_TOLOWER: {
 5102                 uintptr_t s = tupregs[0].dttk_value;
 5103                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 5104                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
 5105                 size_t len = dtrace_strlen((char *)s, size);
 5106                 char lower, upper, convert;
 5107                 int64_t i;
 5108 
 5109                 if (subr == DIF_SUBR_TOUPPER) {
 5110                         lower = 'a';
 5111                         upper = 'z';
 5112                         convert = 'A';
 5113                 } else {
 5114                         lower = 'A';
 5115                         upper = 'Z';
 5116                         convert = 'a';
 5117                 }
 5118 
 5119                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
 5120                         regs[rd] = 0;
 5121                         break;
 5122                 }
 5123 
 5124                 if (!DTRACE_INSCRATCH(mstate, size)) {
 5125                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5126                         regs[rd] = 0;
 5127                         break;
 5128                 }
 5129 
 5130                 for (i = 0; i < size - 1; i++) {
 5131                         if ((c = dtrace_load8(s + i)) == '\0')
 5132                                 break;
 5133 
 5134                         if (c >= lower && c <= upper)
 5135                                 c = convert + (c - lower);
 5136 
 5137                         dest[i] = c;
 5138                 }
 5139 
 5140                 ASSERT(i < size);
 5141                 dest[i] = '\0';
 5142                 regs[rd] = (uintptr_t)dest;
 5143                 mstate->dtms_scratch_ptr += size;
 5144                 break;
 5145         }
 5146 
 5147 #ifdef illumos
 5148         case DIF_SUBR_GETMAJOR:
 5149 #ifdef _LP64
 5150                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
 5151 #else
 5152                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
 5153 #endif
 5154                 break;
 5155 
 5156         case DIF_SUBR_GETMINOR:
 5157 #ifdef _LP64
 5158                 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
 5159 #else
 5160                 regs[rd] = tupregs[0].dttk_value & MAXMIN;
 5161 #endif
 5162                 break;
 5163 
 5164         case DIF_SUBR_DDI_PATHNAME: {
 5165                 /*
 5166                  * This one is a galactic mess.  We are going to roughly
 5167                  * emulate ddi_pathname(), but it's made more complicated
 5168                  * by the fact that we (a) want to include the minor name and
 5169                  * (b) must proceed iteratively instead of recursively.
 5170                  */
 5171                 uintptr_t dest = mstate->dtms_scratch_ptr;
 5172                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 5173                 char *start = (char *)dest, *end = start + size - 1;
 5174                 uintptr_t daddr = tupregs[0].dttk_value;
 5175                 int64_t minor = (int64_t)tupregs[1].dttk_value;
 5176                 char *s;
 5177                 int i, len, depth = 0;
 5178 
 5179                 /*
 5180                  * Due to all the pointer jumping we do and context we must
 5181                  * rely upon, we just mandate that the user must have kernel
 5182                  * read privileges to use this routine.
 5183                  */
 5184                 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
 5185                         *flags |= CPU_DTRACE_KPRIV;
 5186                         *illval = daddr;
 5187                         regs[rd] = 0;
 5188                 }
 5189 
 5190                 if (!DTRACE_INSCRATCH(mstate, size)) {
 5191                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5192                         regs[rd] = 0;
 5193                         break;
 5194                 }
 5195 
 5196                 *end = '\0';
 5197 
 5198                 /*
 5199                  * We want to have a name for the minor.  In order to do this,
 5200                  * we need to walk the minor list from the devinfo.  We want
 5201                  * to be sure that we don't infinitely walk a circular list,
 5202                  * so we check for circularity by sending a scout pointer
 5203                  * ahead two elements for every element that we iterate over;
 5204                  * if the list is circular, these will ultimately point to the
 5205                  * same element.  You may recognize this little trick as the
 5206                  * answer to a stupid interview question -- one that always
 5207                  * seems to be asked by those who had to have it laboriously
 5208                  * explained to them, and who can't even concisely describe
 5209                  * the conditions under which one would be forced to resort to
 5210                  * this technique.  Needless to say, those conditions are
 5211                  * found here -- and probably only here.  Is this the only use
 5212                  * of this infamous trick in shipping, production code?  If it
 5213                  * isn't, it probably should be...
 5214                  */
 5215                 if (minor != -1) {
 5216                         uintptr_t maddr = dtrace_loadptr(daddr +
 5217                             offsetof(struct dev_info, devi_minor));
 5218 
 5219                         uintptr_t next = offsetof(struct ddi_minor_data, next);
 5220                         uintptr_t name = offsetof(struct ddi_minor_data,
 5221                             d_minor) + offsetof(struct ddi_minor, name);
 5222                         uintptr_t dev = offsetof(struct ddi_minor_data,
 5223                             d_minor) + offsetof(struct ddi_minor, dev);
 5224                         uintptr_t scout;
 5225 
 5226                         if (maddr != NULL)
 5227                                 scout = dtrace_loadptr(maddr + next);
 5228 
 5229                         while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
 5230                                 uint64_t m;
 5231 #ifdef _LP64
 5232                                 m = dtrace_load64(maddr + dev) & MAXMIN64;
 5233 #else
 5234                                 m = dtrace_load32(maddr + dev) & MAXMIN;
 5235 #endif
 5236                                 if (m != minor) {
 5237                                         maddr = dtrace_loadptr(maddr + next);
 5238 
 5239                                         if (scout == NULL)
 5240                                                 continue;
 5241 
 5242                                         scout = dtrace_loadptr(scout + next);
 5243 
 5244                                         if (scout == NULL)
 5245                                                 continue;
 5246 
 5247                                         scout = dtrace_loadptr(scout + next);
 5248 
 5249                                         if (scout == NULL)
 5250                                                 continue;
 5251 
 5252                                         if (scout == maddr) {
 5253                                                 *flags |= CPU_DTRACE_ILLOP;
 5254                                                 break;
 5255                                         }
 5256 
 5257                                         continue;
 5258                                 }
 5259 
 5260                                 /*
 5261                                  * We have the minor data.  Now we need to
 5262                                  * copy the minor's name into the end of the
 5263                                  * pathname.
 5264                                  */
 5265                                 s = (char *)dtrace_loadptr(maddr + name);
 5266                                 len = dtrace_strlen(s, size);
 5267 
 5268                                 if (*flags & CPU_DTRACE_FAULT)
 5269                                         break;
 5270 
 5271                                 if (len != 0) {
 5272                                         if ((end -= (len + 1)) < start)
 5273                                                 break;
 5274 
 5275                                         *end = ':';
 5276                                 }
 5277 
 5278                                 for (i = 1; i <= len; i++)
 5279                                         end[i] = dtrace_load8((uintptr_t)s++);
 5280                                 break;
 5281                         }
 5282                 }
 5283 
 5284                 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
 5285                         ddi_node_state_t devi_state;
 5286 
 5287                         devi_state = dtrace_load32(daddr +
 5288                             offsetof(struct dev_info, devi_node_state));
 5289 
 5290                         if (*flags & CPU_DTRACE_FAULT)
 5291                                 break;
 5292 
 5293                         if (devi_state >= DS_INITIALIZED) {
 5294                                 s = (char *)dtrace_loadptr(daddr +
 5295                                     offsetof(struct dev_info, devi_addr));
 5296                                 len = dtrace_strlen(s, size);
 5297 
 5298                                 if (*flags & CPU_DTRACE_FAULT)
 5299                                         break;
 5300 
 5301                                 if (len != 0) {
 5302                                         if ((end -= (len + 1)) < start)
 5303                                                 break;
 5304 
 5305                                         *end = '@';
 5306                                 }
 5307 
 5308                                 for (i = 1; i <= len; i++)
 5309                                         end[i] = dtrace_load8((uintptr_t)s++);
 5310                         }
 5311 
 5312                         /*
 5313                          * Now for the node name...
 5314                          */
 5315                         s = (char *)dtrace_loadptr(daddr +
 5316                             offsetof(struct dev_info, devi_node_name));
 5317 
 5318                         daddr = dtrace_loadptr(daddr +
 5319                             offsetof(struct dev_info, devi_parent));
 5320 
 5321                         /*
 5322                          * If our parent is NULL (that is, if we're the root
 5323                          * node), we're going to use the special path
 5324                          * "devices".
 5325                          */
 5326                         if (daddr == 0)
 5327                                 s = "devices";
 5328 
 5329                         len = dtrace_strlen(s, size);
 5330                         if (*flags & CPU_DTRACE_FAULT)
 5331                                 break;
 5332 
 5333                         if ((end -= (len + 1)) < start)
 5334                                 break;
 5335 
 5336                         for (i = 1; i <= len; i++)
 5337                                 end[i] = dtrace_load8((uintptr_t)s++);
 5338                         *end = '/';
 5339 
 5340                         if (depth++ > dtrace_devdepth_max) {
 5341                                 *flags |= CPU_DTRACE_ILLOP;
 5342                                 break;
 5343                         }
 5344                 }
 5345 
 5346                 if (end < start)
 5347                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5348 
 5349                 if (daddr == 0) {
 5350                         regs[rd] = (uintptr_t)end;
 5351                         mstate->dtms_scratch_ptr += size;
 5352                 }
 5353 
 5354                 break;
 5355         }
 5356 #endif
 5357 
 5358         case DIF_SUBR_STRJOIN: {
 5359                 char *d = (char *)mstate->dtms_scratch_ptr;
 5360                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 5361                 uintptr_t s1 = tupregs[0].dttk_value;
 5362                 uintptr_t s2 = tupregs[1].dttk_value;
 5363                 int i = 0, j = 0;
 5364                 size_t lim1, lim2;
 5365                 char c;
 5366 
 5367                 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
 5368                     !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
 5369                         regs[rd] = 0;
 5370                         break;
 5371                 }
 5372 
 5373                 if (!DTRACE_INSCRATCH(mstate, size)) {
 5374                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5375                         regs[rd] = 0;
 5376                         break;
 5377                 }
 5378 
 5379                 for (;;) {
 5380                         if (i >= size) {
 5381                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5382                                 regs[rd] = 0;
 5383                                 break;
 5384                         }
 5385                         c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
 5386                         if ((d[i++] = c) == '\0') {
 5387                                 i--;
 5388                                 break;
 5389                         }
 5390                 }
 5391 
 5392                 for (;;) {
 5393                         if (i >= size) {
 5394                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5395                                 regs[rd] = 0;
 5396                                 break;
 5397                         }
 5398 
 5399                         c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
 5400                         if ((d[i++] = c) == '\0')
 5401                                 break;
 5402                 }
 5403 
 5404                 if (i < size) {
 5405                         mstate->dtms_scratch_ptr += i;
 5406                         regs[rd] = (uintptr_t)d;
 5407                 }
 5408 
 5409                 break;
 5410         }
 5411 
 5412         case DIF_SUBR_STRTOLL: {
 5413                 uintptr_t s = tupregs[0].dttk_value;
 5414                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 5415                 size_t lim;
 5416                 int base = 10;
 5417 
 5418                 if (nargs > 1) {
 5419                         if ((base = tupregs[1].dttk_value) <= 1 ||
 5420                             base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
 5421                                 *flags |= CPU_DTRACE_ILLOP;
 5422                                 break;
 5423                         }
 5424                 }
 5425 
 5426                 if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
 5427                         regs[rd] = INT64_MIN;
 5428                         break;
 5429                 }
 5430 
 5431                 regs[rd] = dtrace_strtoll((char *)s, base, lim);
 5432                 break;
 5433         }
 5434 
 5435         case DIF_SUBR_LLTOSTR: {
 5436                 int64_t i = (int64_t)tupregs[0].dttk_value;
 5437                 uint64_t val, digit;
 5438                 uint64_t size = 65;     /* enough room for 2^64 in binary */
 5439                 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
 5440                 int base = 10;
 5441 
 5442                 if (nargs > 1) {
 5443                         if ((base = tupregs[1].dttk_value) <= 1 ||
 5444                             base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
 5445                                 *flags |= CPU_DTRACE_ILLOP;
 5446                                 break;
 5447                         }
 5448                 }
 5449 
 5450                 val = (base == 10 && i < 0) ? i * -1 : i;
 5451 
 5452                 if (!DTRACE_INSCRATCH(mstate, size)) {
 5453                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5454                         regs[rd] = 0;
 5455                         break;
 5456                 }
 5457 
 5458                 for (*end-- = '\0'; val; val /= base) {
 5459                         if ((digit = val % base) <= '9' - '0') {
 5460                                 *end-- = '0' + digit;
 5461                         } else {
 5462                                 *end-- = 'a' + (digit - ('9' - '0') - 1);
 5463                         }
 5464                 }
 5465 
 5466                 if (i == 0 && base == 16)
 5467                         *end-- = '0';
 5468 
 5469                 if (base == 16)
 5470                         *end-- = 'x';
 5471 
 5472                 if (i == 0 || base == 8 || base == 16)
 5473                         *end-- = '0';
 5474 
 5475                 if (i < 0 && base == 10)
 5476                         *end-- = '-';
 5477 
 5478                 regs[rd] = (uintptr_t)end + 1;
 5479                 mstate->dtms_scratch_ptr += size;
 5480                 break;
 5481         }
 5482 
 5483         case DIF_SUBR_HTONS:
 5484         case DIF_SUBR_NTOHS:
 5485 #if BYTE_ORDER == BIG_ENDIAN
 5486                 regs[rd] = (uint16_t)tupregs[0].dttk_value;
 5487 #else
 5488                 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
 5489 #endif
 5490                 break;
 5491 
 5492 
 5493         case DIF_SUBR_HTONL:
 5494         case DIF_SUBR_NTOHL:
 5495 #if BYTE_ORDER == BIG_ENDIAN
 5496                 regs[rd] = (uint32_t)tupregs[0].dttk_value;
 5497 #else
 5498                 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
 5499 #endif
 5500                 break;
 5501 
 5502 
 5503         case DIF_SUBR_HTONLL:
 5504         case DIF_SUBR_NTOHLL:
 5505 #if BYTE_ORDER == BIG_ENDIAN
 5506                 regs[rd] = (uint64_t)tupregs[0].dttk_value;
 5507 #else
 5508                 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
 5509 #endif
 5510                 break;
 5511 
 5512 
 5513         case DIF_SUBR_DIRNAME:
 5514         case DIF_SUBR_BASENAME: {
 5515                 char *dest = (char *)mstate->dtms_scratch_ptr;
 5516                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 5517                 uintptr_t src = tupregs[0].dttk_value;
 5518                 int i, j, len = dtrace_strlen((char *)src, size);
 5519                 int lastbase = -1, firstbase = -1, lastdir = -1;
 5520                 int start, end;
 5521 
 5522                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
 5523                         regs[rd] = 0;
 5524                         break;
 5525                 }
 5526 
 5527                 if (!DTRACE_INSCRATCH(mstate, size)) {
 5528                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5529                         regs[rd] = 0;
 5530                         break;
 5531                 }
 5532 
 5533                 /*
 5534                  * The basename and dirname for a zero-length string is
 5535                  * defined to be "."
 5536                  */
 5537                 if (len == 0) {
 5538                         len = 1;
 5539                         src = (uintptr_t)".";
 5540                 }
 5541 
 5542                 /*
 5543                  * Start from the back of the string, moving back toward the
 5544                  * front until we see a character that isn't a slash.  That
 5545                  * character is the last character in the basename.
 5546                  */
 5547                 for (i = len - 1; i >= 0; i--) {
 5548                         if (dtrace_load8(src + i) != '/')
 5549                                 break;
 5550                 }
 5551 
 5552                 if (i >= 0)
 5553                         lastbase = i;
 5554 
 5555                 /*
 5556                  * Starting from the last character in the basename, move
 5557                  * towards the front until we find a slash.  The character
 5558                  * that we processed immediately before that is the first
 5559                  * character in the basename.
 5560                  */
 5561                 for (; i >= 0; i--) {
 5562                         if (dtrace_load8(src + i) == '/')
 5563                                 break;
 5564                 }
 5565 
 5566                 if (i >= 0)
 5567                         firstbase = i + 1;
 5568 
 5569                 /*
 5570                  * Now keep going until we find a non-slash character.  That
 5571                  * character is the last character in the dirname.
 5572                  */
 5573                 for (; i >= 0; i--) {
 5574                         if (dtrace_load8(src + i) != '/')
 5575                                 break;
 5576                 }
 5577 
 5578                 if (i >= 0)
 5579                         lastdir = i;
 5580 
 5581                 ASSERT(!(lastbase == -1 && firstbase != -1));
 5582                 ASSERT(!(firstbase == -1 && lastdir != -1));
 5583 
 5584                 if (lastbase == -1) {
 5585                         /*
 5586                          * We didn't find a non-slash character.  We know that
 5587                          * the length is non-zero, so the whole string must be
 5588                          * slashes.  In either the dirname or the basename
 5589                          * case, we return '/'.
 5590                          */
 5591                         ASSERT(firstbase == -1);
 5592                         firstbase = lastbase = lastdir = 0;
 5593                 }
 5594 
 5595                 if (firstbase == -1) {
 5596                         /*
 5597                          * The entire string consists only of a basename
 5598                          * component.  If we're looking for dirname, we need
 5599                          * to change our string to be just "."; if we're
 5600                          * looking for a basename, we'll just set the first
 5601                          * character of the basename to be 0.
 5602                          */
 5603                         if (subr == DIF_SUBR_DIRNAME) {
 5604                                 ASSERT(lastdir == -1);
 5605                                 src = (uintptr_t)".";
 5606                                 lastdir = 0;
 5607                         } else {
 5608                                 firstbase = 0;
 5609                         }
 5610                 }
 5611 
 5612                 if (subr == DIF_SUBR_DIRNAME) {
 5613                         if (lastdir == -1) {
 5614                                 /*
 5615                                  * We know that we have a slash in the name --
 5616                                  * or lastdir would be set to 0, above.  And
 5617                                  * because lastdir is -1, we know that this
 5618                                  * slash must be the first character.  (That
 5619                                  * is, the full string must be of the form
 5620                                  * "/basename".)  In this case, the last
 5621                                  * character of the directory name is 0.
 5622                                  */
 5623                                 lastdir = 0;
 5624                         }
 5625 
 5626                         start = 0;
 5627                         end = lastdir;
 5628                 } else {
 5629                         ASSERT(subr == DIF_SUBR_BASENAME);
 5630                         ASSERT(firstbase != -1 && lastbase != -1);
 5631                         start = firstbase;
 5632                         end = lastbase;
 5633                 }
 5634 
 5635                 for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
 5636                         dest[j] = dtrace_load8(src + i);
 5637 
 5638                 dest[j] = '\0';
 5639                 regs[rd] = (uintptr_t)dest;
 5640                 mstate->dtms_scratch_ptr += size;
 5641                 break;
 5642         }
 5643 
 5644         case DIF_SUBR_GETF: {
 5645                 uintptr_t fd = tupregs[0].dttk_value;
 5646                 struct filedesc *fdp;
 5647                 file_t *fp;
 5648 
 5649                 if (!dtrace_priv_proc(state)) {
 5650                         regs[rd] = 0;
 5651                         break;
 5652                 }
 5653                 fdp = curproc->p_fd;
 5654                 FILEDESC_SLOCK(fdp);
 5655                 /*
 5656                  * XXXMJG this looks broken as no ref is taken.
 5657                  */
 5658                 fp = fget_noref(fdp, fd);
 5659                 mstate->dtms_getf = fp;
 5660                 regs[rd] = (uintptr_t)fp;
 5661                 FILEDESC_SUNLOCK(fdp);
 5662                 break;
 5663         }
 5664 
 5665         case DIF_SUBR_CLEANPATH: {
 5666                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
 5667                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
 5668                 uintptr_t src = tupregs[0].dttk_value;
 5669                 size_t lim;
 5670                 int i = 0, j = 0;
 5671 #ifdef illumos
 5672                 zone_t *z;
 5673 #endif
 5674 
 5675                 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
 5676                         regs[rd] = 0;
 5677                         break;
 5678                 }
 5679 
 5680                 if (!DTRACE_INSCRATCH(mstate, size)) {
 5681                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5682                         regs[rd] = 0;
 5683                         break;
 5684                 }
 5685 
 5686                 /*
 5687                  * Move forward, loading each character.
 5688                  */
 5689                 do {
 5690                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
 5691 next:
 5692                         if (j + 5 >= size)      /* 5 = strlen("/..c\0") */
 5693                                 break;
 5694 
 5695                         if (c != '/') {
 5696                                 dest[j++] = c;
 5697                                 continue;
 5698                         }
 5699 
 5700                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
 5701 
 5702                         if (c == '/') {
 5703                                 /*
 5704                                  * We have two slashes -- we can just advance
 5705                                  * to the next character.
 5706                                  */
 5707                                 goto next;
 5708                         }
 5709 
 5710                         if (c != '.') {
 5711                                 /*
 5712                                  * This is not "." and it's not ".." -- we can
 5713                                  * just store the "/" and this character and
 5714                                  * drive on.
 5715                                  */
 5716                                 dest[j++] = '/';
 5717                                 dest[j++] = c;
 5718                                 continue;
 5719                         }
 5720 
 5721                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
 5722 
 5723                         if (c == '/') {
 5724                                 /*
 5725                                  * This is a "/./" component.  We're not going
 5726                                  * to store anything in the destination buffer;
 5727                                  * we're just going to go to the next component.
 5728                                  */
 5729                                 goto next;
 5730                         }
 5731 
 5732                         if (c != '.') {
 5733                                 /*
 5734                                  * This is not ".." -- we can just store the
 5735                                  * "/." and this character and continue
 5736                                  * processing.
 5737                                  */
 5738                                 dest[j++] = '/';
 5739                                 dest[j++] = '.';
 5740                                 dest[j++] = c;
 5741                                 continue;
 5742                         }
 5743 
 5744                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
 5745 
 5746                         if (c != '/' && c != '\0') {
 5747                                 /*
 5748                                  * This is not ".." -- it's "..[mumble]".
 5749                                  * We'll store the "/.." and this character
 5750                                  * and continue processing.
 5751                                  */
 5752                                 dest[j++] = '/';
 5753                                 dest[j++] = '.';
 5754                                 dest[j++] = '.';
 5755                                 dest[j++] = c;
 5756                                 continue;
 5757                         }
 5758 
 5759                         /*
 5760                          * This is "/../" or "/..\0".  We need to back up
 5761                          * our destination pointer until we find a "/".
 5762                          */
 5763                         i--;
 5764                         while (j != 0 && dest[--j] != '/')
 5765                                 continue;
 5766 
 5767                         if (c == '\0')
 5768                                 dest[++j] = '/';
 5769                 } while (c != '\0');
 5770 
 5771                 dest[j] = '\0';
 5772 
 5773 #ifdef illumos
 5774                 if (mstate->dtms_getf != NULL &&
 5775                     !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
 5776                     (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
 5777                         /*
 5778                          * If we've done a getf() as a part of this ECB and we
 5779                          * don't have kernel access (and we're not in the global
 5780                          * zone), check if the path we cleaned up begins with
 5781                          * the zone's root path, and trim it off if so.  Note
 5782                          * that this is an output cleanliness issue, not a
 5783                          * security issue: knowing one's zone root path does
 5784                          * not enable privilege escalation.
 5785                          */
 5786                         if (strstr(dest, z->zone_rootpath) == dest)
 5787                                 dest += strlen(z->zone_rootpath) - 1;
 5788                 }
 5789 #endif
 5790 
 5791                 regs[rd] = (uintptr_t)dest;
 5792                 mstate->dtms_scratch_ptr += size;
 5793                 break;
 5794         }
 5795 
 5796         case DIF_SUBR_INET_NTOA:
 5797         case DIF_SUBR_INET_NTOA6:
 5798         case DIF_SUBR_INET_NTOP: {
 5799                 size_t size;
 5800                 int af, argi, i;
 5801                 char *base, *end;
 5802 
 5803                 if (subr == DIF_SUBR_INET_NTOP) {
 5804                         af = (int)tupregs[0].dttk_value;
 5805                         argi = 1;
 5806                 } else {
 5807                         af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
 5808                         argi = 0;
 5809                 }
 5810 
 5811                 if (af == AF_INET) {
 5812                         ipaddr_t ip4;
 5813                         uint8_t *ptr8, val;
 5814 
 5815                         if (!dtrace_canload(tupregs[argi].dttk_value,
 5816                             sizeof (ipaddr_t), mstate, vstate)) {
 5817                                 regs[rd] = 0;
 5818                                 break;
 5819                         }
 5820 
 5821                         /*
 5822                          * Safely load the IPv4 address.
 5823                          */
 5824                         ip4 = dtrace_load32(tupregs[argi].dttk_value);
 5825 
 5826                         /*
 5827                          * Check an IPv4 string will fit in scratch.
 5828                          */
 5829                         size = INET_ADDRSTRLEN;
 5830                         if (!DTRACE_INSCRATCH(mstate, size)) {
 5831                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5832                                 regs[rd] = 0;
 5833                                 break;
 5834                         }
 5835                         base = (char *)mstate->dtms_scratch_ptr;
 5836                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
 5837 
 5838                         /*
 5839                          * Stringify as a dotted decimal quad.
 5840                          */
 5841                         *end-- = '\0';
 5842                         ptr8 = (uint8_t *)&ip4;
 5843                         for (i = 3; i >= 0; i--) {
 5844                                 val = ptr8[i];
 5845 
 5846                                 if (val == 0) {
 5847                                         *end-- = '';
 5848                                 } else {
 5849                                         for (; val; val /= 10) {
 5850                                                 *end-- = '' + (val % 10);
 5851                                         }
 5852                                 }
 5853 
 5854                                 if (i > 0)
 5855                                         *end-- = '.';
 5856                         }
 5857                         ASSERT(end + 1 >= base);
 5858 
 5859                 } else if (af == AF_INET6) {
 5860                         struct in6_addr ip6;
 5861                         int firstzero, tryzero, numzero, v6end;
 5862                         uint16_t val;
 5863                         const char digits[] = "0123456789abcdef";
 5864 
 5865                         /*
 5866                          * Stringify using RFC 1884 convention 2 - 16 bit
 5867                          * hexadecimal values with a zero-run compression.
 5868                          * Lower case hexadecimal digits are used.
 5869                          *      eg, fe80::214:4fff:fe0b:76c8.
 5870                          * The IPv4 embedded form is returned for inet_ntop,
 5871                          * just the IPv4 string is returned for inet_ntoa6.
 5872                          */
 5873 
 5874                         if (!dtrace_canload(tupregs[argi].dttk_value,
 5875                             sizeof (struct in6_addr), mstate, vstate)) {
 5876                                 regs[rd] = 0;
 5877                                 break;
 5878                         }
 5879 
 5880                         /*
 5881                          * Safely load the IPv6 address.
 5882                          */
 5883                         dtrace_bcopy(
 5884                             (void *)(uintptr_t)tupregs[argi].dttk_value,
 5885                             (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
 5886 
 5887                         /*
 5888                          * Check an IPv6 string will fit in scratch.
 5889                          */
 5890                         size = INET6_ADDRSTRLEN;
 5891                         if (!DTRACE_INSCRATCH(mstate, size)) {
 5892                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 5893                                 regs[rd] = 0;
 5894                                 break;
 5895                         }
 5896                         base = (char *)mstate->dtms_scratch_ptr;
 5897                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
 5898                         *end-- = '\0';
 5899 
 5900                         /*
 5901                          * Find the longest run of 16 bit zero values
 5902                          * for the single allowed zero compression - "::".
 5903                          */
 5904                         firstzero = -1;
 5905                         tryzero = -1;
 5906                         numzero = 1;
 5907                         for (i = 0; i < sizeof (struct in6_addr); i++) {
 5908 #ifdef illumos
 5909                                 if (ip6._S6_un._S6_u8[i] == 0 &&
 5910 #else
 5911                                 if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
 5912 #endif
 5913                                     tryzero == -1 && i % 2 == 0) {
 5914                                         tryzero = i;
 5915                                         continue;
 5916                                 }
 5917 
 5918                                 if (tryzero != -1 &&
 5919 #ifdef illumos
 5920                                     (ip6._S6_un._S6_u8[i] != 0 ||
 5921 #else
 5922                                     (ip6.__u6_addr.__u6_addr8[i] != 0 ||
 5923 #endif
 5924                                     i == sizeof (struct in6_addr) - 1)) {
 5925 
 5926                                         if (i - tryzero <= numzero) {
 5927                                                 tryzero = -1;
 5928                                                 continue;
 5929                                         }
 5930 
 5931                                         firstzero = tryzero;
 5932                                         numzero = i - i % 2 - tryzero;
 5933                                         tryzero = -1;
 5934 
 5935 #ifdef illumos
 5936                                         if (ip6._S6_un._S6_u8[i] == 0 &&
 5937 #else
 5938                                         if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
 5939 #endif
 5940                                             i == sizeof (struct in6_addr) - 1)
 5941                                                 numzero += 2;
 5942                                 }
 5943                         }
 5944                         ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
 5945 
 5946                         /*
 5947                          * Check for an IPv4 embedded address.
 5948                          */
 5949                         v6end = sizeof (struct in6_addr) - 2;
 5950                         if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
 5951                             IN6_IS_ADDR_V4COMPAT(&ip6)) {
 5952                                 for (i = sizeof (struct in6_addr) - 1;
 5953                                     i >= DTRACE_V4MAPPED_OFFSET; i--) {
 5954                                         ASSERT(end >= base);
 5955 
 5956 #ifdef illumos
 5957                                         val = ip6._S6_un._S6_u8[i];
 5958 #else
 5959                                         val = ip6.__u6_addr.__u6_addr8[i];
 5960 #endif
 5961 
 5962                                         if (val == 0) {
 5963                                                 *end-- = '';
 5964                                         } else {
 5965                                                 for (; val; val /= 10) {
 5966                                                         *end-- = '' + val % 10;
 5967                                                 }
 5968                                         }
 5969 
 5970                                         if (i > DTRACE_V4MAPPED_OFFSET)
 5971                                                 *end-- = '.';
 5972                                 }
 5973 
 5974                                 if (subr == DIF_SUBR_INET_NTOA6)
 5975                                         goto inetout;
 5976 
 5977                                 /*
 5978                                  * Set v6end to skip the IPv4 address that
 5979                                  * we have already stringified.
 5980                                  */
 5981                                 v6end = 10;
 5982                         }
 5983 
 5984                         /*
 5985                          * Build the IPv6 string by working through the
 5986                          * address in reverse.
 5987                          */
 5988                         for (i = v6end; i >= 0; i -= 2) {
 5989                                 ASSERT(end >= base);
 5990 
 5991                                 if (i == firstzero + numzero - 2) {
 5992                                         *end-- = ':';
 5993                                         *end-- = ':';
 5994                                         i -= numzero - 2;
 5995                                         continue;
 5996                                 }
 5997 
 5998                                 if (i < 14 && i != firstzero - 2)
 5999                                         *end-- = ':';
 6000 
 6001 #ifdef illumos
 6002                                 val = (ip6._S6_un._S6_u8[i] << 8) +
 6003                                     ip6._S6_un._S6_u8[i + 1];
 6004 #else
 6005                                 val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
 6006                                     ip6.__u6_addr.__u6_addr8[i + 1];
 6007 #endif
 6008 
 6009                                 if (val == 0) {
 6010                                         *end-- = '';
 6011                                 } else {
 6012                                         for (; val; val /= 16) {
 6013                                                 *end-- = digits[val % 16];
 6014                                         }
 6015                                 }
 6016                         }
 6017                         ASSERT(end + 1 >= base);
 6018 
 6019                 } else {
 6020                         /*
 6021                          * The user didn't use AF_INET or AF_INET6.
 6022                          */
 6023                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
 6024                         regs[rd] = 0;
 6025                         break;
 6026                 }
 6027 
 6028 inetout:        regs[rd] = (uintptr_t)end + 1;
 6029                 mstate->dtms_scratch_ptr += size;
 6030                 break;
 6031         }
 6032 
 6033         case DIF_SUBR_MEMREF: {
 6034                 uintptr_t size = 2 * sizeof(uintptr_t);
 6035                 uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
 6036                 size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;
 6037 
 6038                 /* address and length */
 6039                 memref[0] = tupregs[0].dttk_value;
 6040                 memref[1] = tupregs[1].dttk_value;
 6041 
 6042                 regs[rd] = (uintptr_t) memref;
 6043                 mstate->dtms_scratch_ptr += scratch_size;
 6044                 break;
 6045         }
 6046 
 6047 #ifndef illumos
 6048         case DIF_SUBR_MEMSTR: {
 6049                 char *str = (char *)mstate->dtms_scratch_ptr;
 6050                 uintptr_t mem = tupregs[0].dttk_value;
 6051                 char c = tupregs[1].dttk_value;
 6052                 size_t size = tupregs[2].dttk_value;
 6053                 uint8_t n;
 6054                 int i;
 6055 
 6056                 regs[rd] = 0;
 6057 
 6058                 if (size == 0)
 6059                         break;
 6060 
 6061                 if (!dtrace_canload(mem, size - 1, mstate, vstate))
 6062                         break;
 6063 
 6064                 if (!DTRACE_INSCRATCH(mstate, size)) {
 6065                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 6066                         break;
 6067                 }
 6068 
 6069                 if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
 6070                         *flags |= CPU_DTRACE_ILLOP;
 6071                         break;
 6072                 }
 6073 
 6074                 for (i = 0; i < size - 1; i++) {
 6075                         n = dtrace_load8(mem++);
 6076                         str[i] = (n == 0) ? c : n;
 6077                 }
 6078                 str[size - 1] = 0;
 6079 
 6080                 regs[rd] = (uintptr_t)str;
 6081                 mstate->dtms_scratch_ptr += size;
 6082                 break;
 6083         }
 6084 #endif
 6085         }
 6086 }
 6087 
 6088 /*
 6089  * Emulate the execution of DTrace IR instructions specified by the given
 6090  * DIF object.  This function is deliberately void of assertions as all of
 6091  * the necessary checks are handled by a call to dtrace_difo_validate().
 6092  */
 6093 static uint64_t
 6094 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
 6095     dtrace_vstate_t *vstate, dtrace_state_t *state)
 6096 {
 6097         const dif_instr_t *text = difo->dtdo_buf;
 6098         const uint_t textlen = difo->dtdo_len;
 6099         const char *strtab = difo->dtdo_strtab;
 6100         const uint64_t *inttab = difo->dtdo_inttab;
 6101 
 6102         uint64_t rval = 0;
 6103         dtrace_statvar_t *svar;
 6104         dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
 6105         dtrace_difv_t *v;
 6106         volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
 6107         volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
 6108 
 6109         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
 6110         uint64_t regs[DIF_DIR_NREGS];
 6111         uint64_t *tmp;
 6112 
 6113         uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
 6114         int64_t cc_r;
 6115         uint_t pc = 0, id, opc = 0;
 6116         uint8_t ttop = 0;
 6117         dif_instr_t instr;
 6118         uint_t r1, r2, rd;
 6119 
 6120         /*
 6121          * We stash the current DIF object into the machine state: we need it
 6122          * for subsequent access checking.
 6123          */
 6124         mstate->dtms_difo = difo;
 6125 
 6126         regs[DIF_REG_R0] = 0;           /* %r0 is fixed at zero */
 6127 
 6128         while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
 6129                 opc = pc;
 6130 
 6131                 instr = text[pc++];
 6132                 r1 = DIF_INSTR_R1(instr);
 6133                 r2 = DIF_INSTR_R2(instr);
 6134                 rd = DIF_INSTR_RD(instr);
 6135 
 6136                 switch (DIF_INSTR_OP(instr)) {
 6137                 case DIF_OP_OR:
 6138                         regs[rd] = regs[r1] | regs[r2];
 6139                         break;
 6140                 case DIF_OP_XOR:
 6141                         regs[rd] = regs[r1] ^ regs[r2];
 6142                         break;
 6143                 case DIF_OP_AND:
 6144                         regs[rd] = regs[r1] & regs[r2];
 6145                         break;
 6146                 case DIF_OP_SLL:
 6147                         regs[rd] = regs[r1] << regs[r2];
 6148                         break;
 6149                 case DIF_OP_SRL:
 6150                         regs[rd] = regs[r1] >> regs[r2];
 6151                         break;
 6152                 case DIF_OP_SUB:
 6153                         regs[rd] = regs[r1] - regs[r2];
 6154                         break;
 6155                 case DIF_OP_ADD:
 6156                         regs[rd] = regs[r1] + regs[r2];
 6157                         break;
 6158                 case DIF_OP_MUL:
 6159                         regs[rd] = regs[r1] * regs[r2];
 6160                         break;
 6161                 case DIF_OP_SDIV:
 6162                         if (regs[r2] == 0) {
 6163                                 regs[rd] = 0;
 6164                                 *flags |= CPU_DTRACE_DIVZERO;
 6165                         } else {
 6166                                 regs[rd] = (int64_t)regs[r1] /
 6167                                     (int64_t)regs[r2];
 6168                         }
 6169                         break;
 6170 
 6171                 case DIF_OP_UDIV:
 6172                         if (regs[r2] == 0) {
 6173                                 regs[rd] = 0;
 6174                                 *flags |= CPU_DTRACE_DIVZERO;
 6175                         } else {
 6176                                 regs[rd] = regs[r1] / regs[r2];
 6177                         }
 6178                         break;
 6179 
 6180                 case DIF_OP_SREM:
 6181                         if (regs[r2] == 0) {
 6182                                 regs[rd] = 0;
 6183                                 *flags |= CPU_DTRACE_DIVZERO;
 6184                         } else {
 6185                                 regs[rd] = (int64_t)regs[r1] %
 6186                                     (int64_t)regs[r2];
 6187                         }
 6188                         break;
 6189 
 6190                 case DIF_OP_UREM:
 6191                         if (regs[r2] == 0) {
 6192                                 regs[rd] = 0;
 6193                                 *flags |= CPU_DTRACE_DIVZERO;
 6194                         } else {
 6195                                 regs[rd] = regs[r1] % regs[r2];
 6196                         }
 6197                         break;
 6198 
 6199                 case DIF_OP_NOT:
 6200                         regs[rd] = ~regs[r1];
 6201                         break;
 6202                 case DIF_OP_MOV:
 6203                         regs[rd] = regs[r1];
 6204                         break;
 6205                 case DIF_OP_CMP:
 6206                         cc_r = regs[r1] - regs[r2];
 6207                         cc_n = cc_r < 0;
 6208                         cc_z = cc_r == 0;
 6209                         cc_v = 0;
 6210                         cc_c = regs[r1] < regs[r2];
 6211                         break;
 6212                 case DIF_OP_TST:
 6213                         cc_n = cc_v = cc_c = 0;
 6214                         cc_z = regs[r1] == 0;
 6215                         break;
 6216                 case DIF_OP_BA:
 6217                         pc = DIF_INSTR_LABEL(instr);
 6218                         break;
 6219                 case DIF_OP_BE:
 6220                         if (cc_z)
 6221                                 pc = DIF_INSTR_LABEL(instr);
 6222                         break;
 6223                 case DIF_OP_BNE:
 6224                         if (cc_z == 0)
 6225                                 pc = DIF_INSTR_LABEL(instr);
 6226                         break;
 6227                 case DIF_OP_BG:
 6228                         if ((cc_z | (cc_n ^ cc_v)) == 0)
 6229                                 pc = DIF_INSTR_LABEL(instr);
 6230                         break;
 6231                 case DIF_OP_BGU:
 6232                         if ((cc_c | cc_z) == 0)
 6233                                 pc = DIF_INSTR_LABEL(instr);
 6234                         break;
 6235                 case DIF_OP_BGE:
 6236                         if ((cc_n ^ cc_v) == 0)
 6237                                 pc = DIF_INSTR_LABEL(instr);
 6238                         break;
 6239                 case DIF_OP_BGEU:
 6240                         if (cc_c == 0)
 6241                                 pc = DIF_INSTR_LABEL(instr);
 6242                         break;
 6243                 case DIF_OP_BL:
 6244                         if (cc_n ^ cc_v)
 6245                                 pc = DIF_INSTR_LABEL(instr);
 6246                         break;
 6247                 case DIF_OP_BLU:
 6248                         if (cc_c)
 6249                                 pc = DIF_INSTR_LABEL(instr);
 6250                         break;
 6251                 case DIF_OP_BLE:
 6252                         if (cc_z | (cc_n ^ cc_v))
 6253                                 pc = DIF_INSTR_LABEL(instr);
 6254                         break;
 6255                 case DIF_OP_BLEU:
 6256                         if (cc_c | cc_z)
 6257                                 pc = DIF_INSTR_LABEL(instr);
 6258                         break;
 6259                 case DIF_OP_RLDSB:
 6260                         if (!dtrace_canload(regs[r1], 1, mstate, vstate))
 6261                                 break;
 6262                         /*FALLTHROUGH*/
 6263                 case DIF_OP_LDSB:
 6264                         regs[rd] = (int8_t)dtrace_load8(regs[r1]);
 6265                         break;
 6266                 case DIF_OP_RLDSH:
 6267                         if (!dtrace_canload(regs[r1], 2, mstate, vstate))
 6268                                 break;
 6269                         /*FALLTHROUGH*/
 6270                 case DIF_OP_LDSH:
 6271                         regs[rd] = (int16_t)dtrace_load16(regs[r1]);
 6272                         break;
 6273                 case DIF_OP_RLDSW:
 6274                         if (!dtrace_canload(regs[r1], 4, mstate, vstate))
 6275                                 break;
 6276                         /*FALLTHROUGH*/
 6277                 case DIF_OP_LDSW:
 6278                         regs[rd] = (int32_t)dtrace_load32(regs[r1]);
 6279                         break;
 6280                 case DIF_OP_RLDUB:
 6281                         if (!dtrace_canload(regs[r1], 1, mstate, vstate))
 6282                                 break;
 6283                         /*FALLTHROUGH*/
 6284                 case DIF_OP_LDUB:
 6285                         regs[rd] = dtrace_load8(regs[r1]);
 6286                         break;
 6287                 case DIF_OP_RLDUH:
 6288                         if (!dtrace_canload(regs[r1], 2, mstate, vstate))
 6289                                 break;
 6290                         /*FALLTHROUGH*/
 6291                 case DIF_OP_LDUH:
 6292                         regs[rd] = dtrace_load16(regs[r1]);
 6293                         break;
 6294                 case DIF_OP_RLDUW:
 6295                         if (!dtrace_canload(regs[r1], 4, mstate, vstate))
 6296                                 break;
 6297                         /*FALLTHROUGH*/
 6298                 case DIF_OP_LDUW:
 6299                         regs[rd] = dtrace_load32(regs[r1]);
 6300                         break;
 6301                 case DIF_OP_RLDX:
 6302                         if (!dtrace_canload(regs[r1], 8, mstate, vstate))
 6303                                 break;
 6304                         /*FALLTHROUGH*/
 6305                 case DIF_OP_LDX:
 6306                         regs[rd] = dtrace_load64(regs[r1]);
 6307                         break;
 6308                 case DIF_OP_ULDSB:
 6309                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 6310                         regs[rd] = (int8_t)
 6311                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
 6312                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 6313                         break;
 6314                 case DIF_OP_ULDSH:
 6315                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 6316                         regs[rd] = (int16_t)
 6317                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
 6318                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 6319                         break;
 6320                 case DIF_OP_ULDSW:
 6321                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 6322                         regs[rd] = (int32_t)
 6323                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
 6324                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 6325                         break;
 6326                 case DIF_OP_ULDUB:
 6327                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 6328                         regs[rd] =
 6329                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
 6330                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 6331                         break;
 6332                 case DIF_OP_ULDUH:
 6333                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 6334                         regs[rd] =
 6335                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
 6336                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 6337                         break;
 6338                 case DIF_OP_ULDUW:
 6339                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 6340                         regs[rd] =
 6341                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
 6342                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 6343                         break;
 6344                 case DIF_OP_ULDX:
 6345                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 6346                         regs[rd] =
 6347                             dtrace_fuword64((void *)(uintptr_t)regs[r1]);
 6348                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 6349                         break;
 6350                 case DIF_OP_RET:
 6351                         rval = regs[rd];
 6352                         pc = textlen;
 6353                         break;
 6354                 case DIF_OP_NOP:
 6355                         break;
 6356                 case DIF_OP_SETX:
 6357                         regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
 6358                         break;
 6359                 case DIF_OP_SETS:
 6360                         regs[rd] = (uint64_t)(uintptr_t)
 6361                             (strtab + DIF_INSTR_STRING(instr));
 6362                         break;
 6363                 case DIF_OP_SCMP: {
 6364                         size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
 6365                         uintptr_t s1 = regs[r1];
 6366                         uintptr_t s2 = regs[r2];
 6367                         size_t lim1, lim2;
 6368 
 6369                         /*
 6370                          * If one of the strings is NULL then the limit becomes
 6371                          * 0 which compares 0 characters in dtrace_strncmp()
 6372                          * resulting in a false positive.  dtrace_strncmp()
 6373                          * treats a NULL as an empty 1-char string.
 6374                          */
 6375                         lim1 = lim2 = 1;
 6376 
 6377                         if (s1 != 0 &&
 6378                             !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
 6379                                 break;
 6380                         if (s2 != 0 &&
 6381                             !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
 6382                                 break;
 6383 
 6384                         cc_r = dtrace_strncmp((char *)s1, (char *)s2,
 6385                             MIN(lim1, lim2));
 6386 
 6387                         cc_n = cc_r < 0;
 6388                         cc_z = cc_r == 0;
 6389                         cc_v = cc_c = 0;
 6390                         break;
 6391                 }
 6392                 case DIF_OP_LDGA:
 6393                         regs[rd] = dtrace_dif_variable(mstate, state,
 6394                             r1, regs[r2]);
 6395                         break;
 6396                 case DIF_OP_LDGS:
 6397                         id = DIF_INSTR_VAR(instr);
 6398 
 6399                         if (id >= DIF_VAR_OTHER_UBASE) {
 6400                                 uintptr_t a;
 6401 
 6402                                 id -= DIF_VAR_OTHER_UBASE;
 6403                                 svar = vstate->dtvs_globals[id];
 6404                                 ASSERT(svar != NULL);
 6405                                 v = &svar->dtsv_var;
 6406 
 6407                                 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
 6408                                         regs[rd] = svar->dtsv_data;
 6409                                         break;
 6410                                 }
 6411 
 6412                                 a = (uintptr_t)svar->dtsv_data;
 6413 
 6414                                 if (*(uint8_t *)a == UINT8_MAX) {
 6415                                         /*
 6416                                          * If the 0th byte is set to UINT8_MAX
 6417                                          * then this is to be treated as a
 6418                                          * reference to a NULL variable.
 6419                                          */
 6420                                         regs[rd] = 0;
 6421                                 } else {
 6422                                         regs[rd] = a + sizeof (uint64_t);
 6423                                 }
 6424 
 6425                                 break;
 6426                         }
 6427 
 6428                         regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
 6429                         break;
 6430 
 6431                 case DIF_OP_STGS:
 6432                         id = DIF_INSTR_VAR(instr);
 6433 
 6434                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
 6435                         id -= DIF_VAR_OTHER_UBASE;
 6436 
 6437                         VERIFY(id < vstate->dtvs_nglobals);
 6438                         svar = vstate->dtvs_globals[id];
 6439                         ASSERT(svar != NULL);
 6440                         v = &svar->dtsv_var;
 6441 
 6442                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 6443                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
 6444                                 size_t lim;
 6445 
 6446                                 ASSERT(a != 0);
 6447                                 ASSERT(svar->dtsv_size != 0);
 6448 
 6449                                 if (regs[rd] == 0) {
 6450                                         *(uint8_t *)a = UINT8_MAX;
 6451                                         break;
 6452                                 } else {
 6453                                         *(uint8_t *)a = 0;
 6454                                         a += sizeof (uint64_t);
 6455                                 }
 6456                                 if (!dtrace_vcanload(
 6457                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
 6458                                     &lim, mstate, vstate))
 6459                                         break;
 6460 
 6461                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
 6462                                     (void *)a, &v->dtdv_type, lim);
 6463                                 break;
 6464                         }
 6465 
 6466                         svar->dtsv_data = regs[rd];
 6467                         break;
 6468 
 6469                 case DIF_OP_LDTA:
 6470                         /*
 6471                          * There are no DTrace built-in thread-local arrays at
 6472                          * present.  This opcode is saved for future work.
 6473                          */
 6474                         *flags |= CPU_DTRACE_ILLOP;
 6475                         regs[rd] = 0;
 6476                         break;
 6477 
 6478                 case DIF_OP_LDLS:
 6479                         id = DIF_INSTR_VAR(instr);
 6480 
 6481                         if (id < DIF_VAR_OTHER_UBASE) {
 6482                                 /*
 6483                                  * For now, this has no meaning.
 6484                                  */
 6485                                 regs[rd] = 0;
 6486                                 break;
 6487                         }
 6488 
 6489                         id -= DIF_VAR_OTHER_UBASE;
 6490 
 6491                         ASSERT(id < vstate->dtvs_nlocals);
 6492                         ASSERT(vstate->dtvs_locals != NULL);
 6493 
 6494                         svar = vstate->dtvs_locals[id];
 6495                         ASSERT(svar != NULL);
 6496                         v = &svar->dtsv_var;
 6497 
 6498                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 6499                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
 6500                                 size_t sz = v->dtdv_type.dtdt_size;
 6501                                 size_t lim;
 6502 
 6503                                 sz += sizeof (uint64_t);
 6504                                 ASSERT(svar->dtsv_size == NCPU * sz);
 6505                                 a += curcpu * sz;
 6506 
 6507                                 if (*(uint8_t *)a == UINT8_MAX) {
 6508                                         /*
 6509                                          * If the 0th byte is set to UINT8_MAX
 6510                                          * then this is to be treated as a
 6511                                          * reference to a NULL variable.
 6512                                          */
 6513                                         regs[rd] = 0;
 6514                                 } else {
 6515                                         regs[rd] = a + sizeof (uint64_t);
 6516                                 }
 6517 
 6518                                 break;
 6519                         }
 6520 
 6521                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
 6522                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
 6523                         regs[rd] = tmp[curcpu];
 6524                         break;
 6525 
 6526                 case DIF_OP_STLS:
 6527                         id = DIF_INSTR_VAR(instr);
 6528 
 6529                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
 6530                         id -= DIF_VAR_OTHER_UBASE;
 6531                         VERIFY(id < vstate->dtvs_nlocals);
 6532 
 6533                         ASSERT(vstate->dtvs_locals != NULL);
 6534                         svar = vstate->dtvs_locals[id];
 6535                         ASSERT(svar != NULL);
 6536                         v = &svar->dtsv_var;
 6537 
 6538                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 6539                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
 6540                                 size_t sz = v->dtdv_type.dtdt_size;
 6541                                 size_t lim;
 6542 
 6543                                 sz += sizeof (uint64_t);
 6544                                 ASSERT(svar->dtsv_size == NCPU * sz);
 6545                                 a += curcpu * sz;
 6546 
 6547                                 if (regs[rd] == 0) {
 6548                                         *(uint8_t *)a = UINT8_MAX;
 6549                                         break;
 6550                                 } else {
 6551                                         *(uint8_t *)a = 0;
 6552                                         a += sizeof (uint64_t);
 6553                                 }
 6554 
 6555                                 if (!dtrace_vcanload(
 6556                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
 6557                                     &lim, mstate, vstate))
 6558                                         break;
 6559 
 6560                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
 6561                                     (void *)a, &v->dtdv_type, lim);
 6562                                 break;
 6563                         }
 6564 
 6565                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
 6566                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
 6567                         tmp[curcpu] = regs[rd];
 6568                         break;
 6569 
 6570                 case DIF_OP_LDTS: {
 6571                         dtrace_dynvar_t *dvar;
 6572                         dtrace_key_t *key;
 6573 
 6574                         id = DIF_INSTR_VAR(instr);
 6575                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
 6576                         id -= DIF_VAR_OTHER_UBASE;
 6577                         v = &vstate->dtvs_tlocals[id];
 6578 
 6579                         key = &tupregs[DIF_DTR_NREGS];
 6580                         key[0].dttk_value = (uint64_t)id;
 6581                         key[0].dttk_size = 0;
 6582                         DTRACE_TLS_THRKEY(key[1].dttk_value);
 6583                         key[1].dttk_size = 0;
 6584 
 6585                         dvar = dtrace_dynvar(dstate, 2, key,
 6586                             sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
 6587                             mstate, vstate);
 6588 
 6589                         if (dvar == NULL) {
 6590                                 regs[rd] = 0;
 6591                                 break;
 6592                         }
 6593 
 6594                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 6595                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
 6596                         } else {
 6597                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
 6598                         }
 6599 
 6600                         break;
 6601                 }
 6602 
 6603                 case DIF_OP_STTS: {
 6604                         dtrace_dynvar_t *dvar;
 6605                         dtrace_key_t *key;
 6606 
 6607                         id = DIF_INSTR_VAR(instr);
 6608                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
 6609                         id -= DIF_VAR_OTHER_UBASE;
 6610                         VERIFY(id < vstate->dtvs_ntlocals);
 6611 
 6612                         key = &tupregs[DIF_DTR_NREGS];
 6613                         key[0].dttk_value = (uint64_t)id;
 6614                         key[0].dttk_size = 0;
 6615                         DTRACE_TLS_THRKEY(key[1].dttk_value);
 6616                         key[1].dttk_size = 0;
 6617                         v = &vstate->dtvs_tlocals[id];
 6618 
 6619                         dvar = dtrace_dynvar(dstate, 2, key,
 6620                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
 6621                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
 6622                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
 6623                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
 6624 
 6625                         /*
 6626                          * Given that we're storing to thread-local data,
 6627                          * we need to flush our predicate cache.
 6628                          */
 6629                         curthread->t_predcache = 0;
 6630 
 6631                         if (dvar == NULL)
 6632                                 break;
 6633 
 6634                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 6635                                 size_t lim;
 6636 
 6637                                 if (!dtrace_vcanload(
 6638                                     (void *)(uintptr_t)regs[rd],
 6639                                     &v->dtdv_type, &lim, mstate, vstate))
 6640                                         break;
 6641 
 6642                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
 6643                                     dvar->dtdv_data, &v->dtdv_type, lim);
 6644                         } else {
 6645                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
 6646                         }
 6647 
 6648                         break;
 6649                 }
 6650 
 6651                 case DIF_OP_SRA:
 6652                         regs[rd] = (int64_t)regs[r1] >> regs[r2];
 6653                         break;
 6654 
 6655                 case DIF_OP_CALL:
 6656                         dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
 6657                             regs, tupregs, ttop, mstate, state);
 6658                         break;
 6659 
 6660                 case DIF_OP_PUSHTR:
 6661                         if (ttop == DIF_DTR_NREGS) {
 6662                                 *flags |= CPU_DTRACE_TUPOFLOW;
 6663                                 break;
 6664                         }
 6665 
 6666                         if (r1 == DIF_TYPE_STRING) {
 6667                                 /*
 6668                                  * If this is a string type and the size is 0,
 6669                                  * we'll use the system-wide default string
 6670                                  * size.  Note that we are _not_ looking at
 6671                                  * the value of the DTRACEOPT_STRSIZE option;
 6672                                  * had this been set, we would expect to have
 6673                                  * a non-zero size value in the "pushtr".
 6674                                  */
 6675                                 tupregs[ttop].dttk_size =
 6676                                     dtrace_strlen((char *)(uintptr_t)regs[rd],
 6677                                     regs[r2] ? regs[r2] :
 6678                                     dtrace_strsize_default) + 1;
 6679                         } else {
 6680                                 if (regs[r2] > LONG_MAX) {
 6681                                         *flags |= CPU_DTRACE_ILLOP;
 6682                                         break;
 6683                                 }
 6684 
 6685                                 tupregs[ttop].dttk_size = regs[r2];
 6686                         }
 6687 
 6688                         tupregs[ttop++].dttk_value = regs[rd];
 6689                         break;
 6690 
 6691                 case DIF_OP_PUSHTV:
 6692                         if (ttop == DIF_DTR_NREGS) {
 6693                                 *flags |= CPU_DTRACE_TUPOFLOW;
 6694                                 break;
 6695                         }
 6696 
 6697                         tupregs[ttop].dttk_value = regs[rd];
 6698                         tupregs[ttop++].dttk_size = 0;
 6699                         break;
 6700 
 6701                 case DIF_OP_POPTS:
 6702                         if (ttop != 0)
 6703                                 ttop--;
 6704                         break;
 6705 
 6706                 case DIF_OP_FLUSHTS:
 6707                         ttop = 0;
 6708                         break;
 6709 
 6710                 case DIF_OP_LDGAA:
 6711                 case DIF_OP_LDTAA: {
 6712                         dtrace_dynvar_t *dvar;
 6713                         dtrace_key_t *key = tupregs;
 6714                         uint_t nkeys = ttop;
 6715 
 6716                         id = DIF_INSTR_VAR(instr);
 6717                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
 6718                         id -= DIF_VAR_OTHER_UBASE;
 6719 
 6720                         key[nkeys].dttk_value = (uint64_t)id;
 6721                         key[nkeys++].dttk_size = 0;
 6722 
 6723                         if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
 6724                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
 6725                                 key[nkeys++].dttk_size = 0;
 6726                                 VERIFY(id < vstate->dtvs_ntlocals);
 6727                                 v = &vstate->dtvs_tlocals[id];
 6728                         } else {
 6729                                 VERIFY(id < vstate->dtvs_nglobals);
 6730                                 v = &vstate->dtvs_globals[id]->dtsv_var;
 6731                         }
 6732 
 6733                         dvar = dtrace_dynvar(dstate, nkeys, key,
 6734                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
 6735                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
 6736                             DTRACE_DYNVAR_NOALLOC, mstate, vstate);
 6737 
 6738                         if (dvar == NULL) {
 6739                                 regs[rd] = 0;
 6740                                 break;
 6741                         }
 6742 
 6743                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 6744                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
 6745                         } else {
 6746                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
 6747                         }
 6748 
 6749                         break;
 6750                 }
 6751 
 6752                 case DIF_OP_STGAA:
 6753                 case DIF_OP_STTAA: {
 6754                         dtrace_dynvar_t *dvar;
 6755                         dtrace_key_t *key = tupregs;
 6756                         uint_t nkeys = ttop;
 6757 
 6758                         id = DIF_INSTR_VAR(instr);
 6759                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
 6760                         id -= DIF_VAR_OTHER_UBASE;
 6761 
 6762                         key[nkeys].dttk_value = (uint64_t)id;
 6763                         key[nkeys++].dttk_size = 0;
 6764 
 6765                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
 6766                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
 6767                                 key[nkeys++].dttk_size = 0;
 6768                                 VERIFY(id < vstate->dtvs_ntlocals);
 6769                                 v = &vstate->dtvs_tlocals[id];
 6770                         } else {
 6771                                 VERIFY(id < vstate->dtvs_nglobals);
 6772                                 v = &vstate->dtvs_globals[id]->dtsv_var;
 6773                         }
 6774 
 6775                         dvar = dtrace_dynvar(dstate, nkeys, key,
 6776                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
 6777                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
 6778                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
 6779                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
 6780 
 6781                         if (dvar == NULL)
 6782                                 break;
 6783 
 6784                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
 6785                                 size_t lim;
 6786 
 6787                                 if (!dtrace_vcanload(
 6788                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
 6789                                     &lim, mstate, vstate))
 6790                                         break;
 6791 
 6792                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
 6793                                     dvar->dtdv_data, &v->dtdv_type, lim);
 6794                         } else {
 6795                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
 6796                         }
 6797 
 6798                         break;
 6799                 }
 6800 
 6801                 case DIF_OP_ALLOCS: {
 6802                         uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
 6803                         size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
 6804 
 6805                         /*
 6806                          * Rounding up the user allocation size could have
 6807                          * overflowed large, bogus allocations (like -1ULL) to
 6808                          * 0.
 6809                          */
 6810                         if (size < regs[r1] ||
 6811                             !DTRACE_INSCRATCH(mstate, size)) {
 6812                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 6813                                 regs[rd] = 0;
 6814                                 break;
 6815                         }
 6816 
 6817                         dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
 6818                         mstate->dtms_scratch_ptr += size;
 6819                         regs[rd] = ptr;
 6820                         break;
 6821                 }
 6822 
 6823                 case DIF_OP_COPYS:
 6824                         if (!dtrace_canstore(regs[rd], regs[r2],
 6825                             mstate, vstate)) {
 6826                                 *flags |= CPU_DTRACE_BADADDR;
 6827                                 *illval = regs[rd];
 6828                                 break;
 6829                         }
 6830 
 6831                         if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
 6832                                 break;
 6833 
 6834                         dtrace_bcopy((void *)(uintptr_t)regs[r1],
 6835                             (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
 6836                         break;
 6837 
 6838                 case DIF_OP_STB:
 6839                         if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
 6840                                 *flags |= CPU_DTRACE_BADADDR;
 6841                                 *illval = regs[rd];
 6842                                 break;
 6843                         }
 6844                         *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
 6845                         break;
 6846 
 6847                 case DIF_OP_STH:
 6848                         if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
 6849                                 *flags |= CPU_DTRACE_BADADDR;
 6850                                 *illval = regs[rd];
 6851                                 break;
 6852                         }
 6853                         if (regs[rd] & 1) {
 6854                                 *flags |= CPU_DTRACE_BADALIGN;
 6855                                 *illval = regs[rd];
 6856                                 break;
 6857                         }
 6858                         *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
 6859                         break;
 6860 
 6861                 case DIF_OP_STW:
 6862                         if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
 6863                                 *flags |= CPU_DTRACE_BADADDR;
 6864                                 *illval = regs[rd];
 6865                                 break;
 6866                         }
 6867                         if (regs[rd] & 3) {
 6868                                 *flags |= CPU_DTRACE_BADALIGN;
 6869                                 *illval = regs[rd];
 6870                                 break;
 6871                         }
 6872                         *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
 6873                         break;
 6874 
 6875                 case DIF_OP_STX:
 6876                         if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
 6877                                 *flags |= CPU_DTRACE_BADADDR;
 6878                                 *illval = regs[rd];
 6879                                 break;
 6880                         }
 6881                         if (regs[rd] & 7) {
 6882                                 *flags |= CPU_DTRACE_BADALIGN;
 6883                                 *illval = regs[rd];
 6884                                 break;
 6885                         }
 6886                         *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
 6887                         break;
 6888                 }
 6889         }
 6890 
 6891         if (!(*flags & CPU_DTRACE_FAULT))
 6892                 return (rval);
 6893 
 6894         mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
 6895         mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
 6896 
 6897         return (0);
 6898 }
 6899 
 6900 static void
 6901 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
 6902 {
 6903         dtrace_probe_t *probe = ecb->dte_probe;
 6904         dtrace_provider_t *prov = probe->dtpr_provider;
 6905         char c[DTRACE_FULLNAMELEN + 80], *str;
 6906         char *msg = "dtrace: breakpoint action at probe ";
 6907         char *ecbmsg = " (ecb ";
 6908         uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
 6909         uintptr_t val = (uintptr_t)ecb;
 6910         int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
 6911 
 6912         if (dtrace_destructive_disallow)
 6913                 return;
 6914 
 6915         /*
 6916          * It's impossible to be taking action on the NULL probe.
 6917          */
 6918         ASSERT(probe != NULL);
 6919 
 6920         /*
 6921          * This is a poor man's (destitute man's?) sprintf():  we want to
 6922          * print the provider name, module name, function name and name of
 6923          * the probe, along with the hex address of the ECB with the breakpoint
 6924          * action -- all of which we must place in the character buffer by
 6925          * hand.
 6926          */
 6927         while (*msg != '\0')
 6928                 c[i++] = *msg++;
 6929 
 6930         for (str = prov->dtpv_name; *str != '\0'; str++)
 6931                 c[i++] = *str;
 6932         c[i++] = ':';
 6933 
 6934         for (str = probe->dtpr_mod; *str != '\0'; str++)
 6935                 c[i++] = *str;
 6936         c[i++] = ':';
 6937 
 6938         for (str = probe->dtpr_func; *str != '\0'; str++)
 6939                 c[i++] = *str;
 6940         c[i++] = ':';
 6941 
 6942         for (str = probe->dtpr_name; *str != '\0'; str++)
 6943                 c[i++] = *str;
 6944 
 6945         while (*ecbmsg != '\0')
 6946                 c[i++] = *ecbmsg++;
 6947 
 6948         while (shift >= 0) {
 6949                 mask = (uintptr_t)0xf << shift;
 6950 
 6951                 if (val >= ((uintptr_t)1 << shift))
 6952                         c[i++] = "0123456789abcdef"[(val & mask) >> shift];
 6953                 shift -= 4;
 6954         }
 6955 
 6956         c[i++] = ')';
 6957         c[i] = '\0';
 6958 
 6959 #ifdef illumos
 6960         debug_enter(c);
 6961 #else
 6962         kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
 6963 #endif
 6964 }
 6965 
 6966 static void
 6967 dtrace_action_panic(dtrace_ecb_t *ecb)
 6968 {
 6969         dtrace_probe_t *probe = ecb->dte_probe;
 6970 
 6971         /*
 6972          * It's impossible to be taking action on the NULL probe.
 6973          */
 6974         ASSERT(probe != NULL);
 6975 
 6976         if (dtrace_destructive_disallow)
 6977                 return;
 6978 
 6979         if (dtrace_panicked != NULL)
 6980                 return;
 6981 
 6982         if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
 6983                 return;
 6984 
 6985         /*
 6986          * We won the right to panic.  (We want to be sure that only one
 6987          * thread calls panic() from dtrace_probe(), and that panic() is
 6988          * called exactly once.)
 6989          */
 6990         dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
 6991             probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
 6992             probe->dtpr_func, probe->dtpr_name, (void *)ecb);
 6993 }
 6994 
 6995 static void
 6996 dtrace_action_raise(uint64_t sig)
 6997 {
 6998         if (dtrace_destructive_disallow)
 6999                 return;
 7000 
 7001         if (sig >= NSIG) {
 7002                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
 7003                 return;
 7004         }
 7005 
 7006 #ifdef illumos
 7007         /*
 7008          * raise() has a queue depth of 1 -- we ignore all subsequent
 7009          * invocations of the raise() action.
 7010          */
 7011         if (curthread->t_dtrace_sig == 0)
 7012                 curthread->t_dtrace_sig = (uint8_t)sig;
 7013 
 7014         curthread->t_sig_check = 1;
 7015         aston(curthread);
 7016 #else
 7017         struct proc *p = curproc;
 7018         PROC_LOCK(p);
 7019         kern_psignal(p, sig);
 7020         PROC_UNLOCK(p);
 7021 #endif
 7022 }
 7023 
 7024 static void
 7025 dtrace_action_stop(void)
 7026 {
 7027         if (dtrace_destructive_disallow)
 7028                 return;
 7029 
 7030 #ifdef illumos
 7031         if (!curthread->t_dtrace_stop) {
 7032                 curthread->t_dtrace_stop = 1;
 7033                 curthread->t_sig_check = 1;
 7034                 aston(curthread);
 7035         }
 7036 #else
 7037         struct proc *p = curproc;
 7038         PROC_LOCK(p);
 7039         kern_psignal(p, SIGSTOP);
 7040         PROC_UNLOCK(p);
 7041 #endif
 7042 }
 7043 
 7044 static void
 7045 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
 7046 {
 7047         hrtime_t now;
 7048         volatile uint16_t *flags;
 7049 #ifdef illumos
 7050         cpu_t *cpu = CPU;
 7051 #else
 7052         cpu_t *cpu = &solaris_cpu[curcpu];
 7053 #endif
 7054 
 7055         if (dtrace_destructive_disallow)
 7056                 return;
 7057 
 7058         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
 7059 
 7060         now = dtrace_gethrtime();
 7061 
 7062         if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
 7063                 /*
 7064                  * We need to advance the mark to the current time.
 7065                  */
 7066                 cpu->cpu_dtrace_chillmark = now;
 7067                 cpu->cpu_dtrace_chilled = 0;
 7068         }
 7069 
 7070         /*
 7071          * Now check to see if the requested chill time would take us over
 7072          * the maximum amount of time allowed in the chill interval.  (Or
 7073          * worse, if the calculation itself induces overflow.)
 7074          */
 7075         if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
 7076             cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
 7077                 *flags |= CPU_DTRACE_ILLOP;
 7078                 return;
 7079         }
 7080 
 7081         while (dtrace_gethrtime() - now < val)
 7082                 continue;
 7083 
 7084         /*
 7085          * Normally, we assure that the value of the variable "timestamp" does
 7086          * not change within an ECB.  The presence of chill() represents an
 7087          * exception to this rule, however.
 7088          */
 7089         mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
 7090         cpu->cpu_dtrace_chilled += val;
 7091 }
 7092 
 7093 static void
 7094 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
 7095     uint64_t *buf, uint64_t arg)
 7096 {
 7097         int nframes = DTRACE_USTACK_NFRAMES(arg);
 7098         int strsize = DTRACE_USTACK_STRSIZE(arg);
 7099         uint64_t *pcs = &buf[1], *fps;
 7100         char *str = (char *)&pcs[nframes];
 7101         int size, offs = 0, i, j;
 7102         size_t rem;
 7103         uintptr_t old = mstate->dtms_scratch_ptr, saved;
 7104         uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
 7105         char *sym;
 7106 
 7107         /*
 7108          * Should be taking a faster path if string space has not been
 7109          * allocated.
 7110          */
 7111         ASSERT(strsize != 0);
 7112 
 7113         /*
 7114          * We will first allocate some temporary space for the frame pointers.
 7115          */
 7116         fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
 7117         size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
 7118             (nframes * sizeof (uint64_t));
 7119 
 7120         if (!DTRACE_INSCRATCH(mstate, size)) {
 7121                 /*
 7122                  * Not enough room for our frame pointers -- need to indicate
 7123                  * that we ran out of scratch space.
 7124                  */
 7125                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
 7126                 return;
 7127         }
 7128 
 7129         mstate->dtms_scratch_ptr += size;
 7130         saved = mstate->dtms_scratch_ptr;
 7131 
 7132         /*
 7133          * Now get a stack with both program counters and frame pointers.
 7134          */
 7135         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 7136         dtrace_getufpstack(buf, fps, nframes + 1);
 7137         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 7138 
 7139         /*
 7140          * If that faulted, we're cooked.
 7141          */
 7142         if (*flags & CPU_DTRACE_FAULT)
 7143                 goto out;
 7144 
 7145         /*
 7146          * Now we want to walk up the stack, calling the USTACK helper.  For
 7147          * each iteration, we restore the scratch pointer.
 7148          */
 7149         for (i = 0; i < nframes; i++) {
 7150                 mstate->dtms_scratch_ptr = saved;
 7151 
 7152                 if (offs >= strsize)
 7153                         break;
 7154 
 7155                 sym = (char *)(uintptr_t)dtrace_helper(
 7156                     DTRACE_HELPER_ACTION_USTACK,
 7157                     mstate, state, pcs[i], fps[i]);
 7158 
 7159                 /*
 7160                  * If we faulted while running the helper, we're going to
 7161                  * clear the fault and null out the corresponding string.
 7162                  */
 7163                 if (*flags & CPU_DTRACE_FAULT) {
 7164                         *flags &= ~CPU_DTRACE_FAULT;
 7165                         str[offs++] = '\0';
 7166                         continue;
 7167                 }
 7168 
 7169                 if (sym == NULL) {
 7170                         str[offs++] = '\0';
 7171                         continue;
 7172                 }
 7173 
 7174                 if (!dtrace_strcanload((uintptr_t)sym, strsize, &rem, mstate,
 7175                     &(state->dts_vstate))) {
 7176                         str[offs++] = '\0';
 7177                         continue;
 7178                 }
 7179 
 7180                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 7181 
 7182                 /*
 7183                  * Now copy in the string that the helper returned to us.
 7184                  */
 7185                 for (j = 0; offs + j < strsize && j < rem; j++) {
 7186                         if ((str[offs + j] = sym[j]) == '\0')
 7187                                 break;
 7188                 }
 7189 
 7190                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 7191 
 7192                 offs += j + 1;
 7193         }
 7194 
 7195         if (offs >= strsize) {
 7196                 /*
 7197                  * If we didn't have room for all of the strings, we don't
 7198                  * abort processing -- this needn't be a fatal error -- but we
 7199                  * still want to increment a counter (dts_stkstroverflows) to
 7200                  * allow this condition to be warned about.  (If this is from
 7201                  * a jstack() action, it is easily tuned via jstackstrsize.)
 7202                  */
 7203                 dtrace_error(&state->dts_stkstroverflows);
 7204         }
 7205 
 7206         while (offs < strsize)
 7207                 str[offs++] = '\0';
 7208 
 7209 out:
 7210         mstate->dtms_scratch_ptr = old;
 7211 }
 7212 
 7213 static void
 7214 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
 7215     size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
 7216 {
 7217         volatile uint16_t *flags;
 7218         uint64_t val = *valp;
 7219         size_t valoffs = *valoffsp;
 7220 
 7221         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
 7222         ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
 7223 
 7224         /*
 7225          * If this is a string, we're going to only load until we find the zero
 7226          * byte -- after which we'll store zero bytes.
 7227          */
 7228         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
 7229                 char c = '\0' + 1;
 7230                 size_t s;
 7231 
 7232                 for (s = 0; s < size; s++) {
 7233                         if (c != '\0' && dtkind == DIF_TF_BYREF) {
 7234                                 c = dtrace_load8(val++);
 7235                         } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
 7236                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 7237                                 c = dtrace_fuword8((void *)(uintptr_t)val++);
 7238                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 7239                                 if (*flags & CPU_DTRACE_FAULT)
 7240                                         break;
 7241                         }
 7242 
 7243                         DTRACE_STORE(uint8_t, tomax, valoffs++, c);
 7244 
 7245                         if (c == '\0' && intuple)
 7246                                 break;
 7247                 }
 7248         } else {
 7249                 uint8_t c;
 7250                 while (valoffs < end) {
 7251                         if (dtkind == DIF_TF_BYREF) {
 7252                                 c = dtrace_load8(val++);
 7253                         } else if (dtkind == DIF_TF_BYUREF) {
 7254                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 7255                                 c = dtrace_fuword8((void *)(uintptr_t)val++);
 7256                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 7257                                 if (*flags & CPU_DTRACE_FAULT)
 7258                                         break;
 7259                         }
 7260 
 7261                         DTRACE_STORE(uint8_t, tomax,
 7262                             valoffs++, c);
 7263                 }
 7264         }
 7265 
 7266         *valp = val;
 7267         *valoffsp = valoffs;
 7268 }
 7269 
 7270 /*
 7271  * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
 7272  * defined, we also assert that we are not recursing unless the probe ID is an
 7273  * error probe.
 7274  */
 7275 static dtrace_icookie_t
 7276 dtrace_probe_enter(dtrace_id_t id)
 7277 {
 7278         dtrace_icookie_t cookie;
 7279 
 7280         cookie = dtrace_interrupt_disable();
 7281 
 7282         /*
 7283          * Unless this is an ERROR probe, we are not allowed to recurse in
 7284          * dtrace_probe(). Recursing into DTrace probe usually means that a
 7285          * function is instrumented that should not have been instrumented or
 7286          * that the ordering guarantee of the records will be violated,
 7287          * resulting in unexpected output. If there is an exception to this
 7288          * assertion, a new case should be added.
 7289          */
 7290         ASSERT(curthread->t_dtrace_inprobe == 0 ||
 7291             id == dtrace_probeid_error);
 7292         curthread->t_dtrace_inprobe = 1;
 7293 
 7294         return (cookie);
 7295 }
 7296 
 7297 /*
 7298  * Clears the per-thread inprobe flag and enables interrupts.
       *
       * This is the counterpart of dtrace_probe_enter():  'cookie' is the value
       * that function returned, and it is handed to dtrace_interrupt_enable()
       * only after curthread's t_dtrace_inprobe flag has been reset to 0, i.e.
       * the two steps of dtrace_probe_enter() are undone in reverse order.
       *
       * NOTE(review): the flag is reset unconditionally, so an exit from a
       * nested invocation (dtrace_probe_enter() tolerates recursion for the
       * ERROR probe) also clears it for the outer one -- confirm this is
       * intended.
 7299  */
 7300 static void
 7301 dtrace_probe_exit(dtrace_icookie_t cookie)
 7302 {
 7303 
 7304         curthread->t_dtrace_inprobe = 0;
 7305         dtrace_interrupt_enable(cookie);
 7306 }
 7307 
 7308 /*
 7309  * If you're looking for the epicenter of DTrace, you just found it.  This
 7310  * is the function called by the provider to fire a probe -- from which all
 7311  * subsequent probe-context DTrace activity emanates.
 7312  */
 7313 void
 7314 dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
 7315     uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
 7316 {
 7317         processorid_t cpuid;
 7318         dtrace_icookie_t cookie;
 7319         dtrace_probe_t *probe;
 7320         dtrace_mstate_t mstate;
 7321         dtrace_ecb_t *ecb;
 7322         dtrace_action_t *act;
 7323         intptr_t offs;
 7324         size_t size;
 7325         int vtime, onintr;
 7326         volatile uint16_t *flags;
 7327         hrtime_t now;
 7328 
 7329         if (KERNEL_PANICKED())
 7330                 return;
 7331 
 7332 #ifdef illumos
 7333         /*
 7334          * Kick out immediately if this CPU is still being born (in which case
 7335          * curthread will be set to -1) or the current thread can't allow
 7336          * probes in its current context.
 7337          */
 7338         if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
 7339                 return;
 7340 #endif
 7341 
 7342         cookie = dtrace_probe_enter(id);
 7343         probe = dtrace_probes[id - 1];
 7344         cpuid = curcpu;
 7345         onintr = CPU_ON_INTR(CPU);
 7346 
 7347         if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
 7348             probe->dtpr_predcache == curthread->t_predcache) {
 7349                 /*
 7350                  * We have hit in the predicate cache; we know that
 7351                  * this predicate would evaluate to be false.
 7352                  */
 7353                 dtrace_probe_exit(cookie);
 7354                 return;
 7355         }
 7356 
 7357 #ifdef illumos
 7358         if (panic_quiesce) {
 7359 #else
 7360         if (KERNEL_PANICKED()) {
 7361 #endif
 7362                 /*
 7363                  * We don't trace anything if we're panicking.
 7364                  */
 7365                 dtrace_probe_exit(cookie);
 7366                 return;
 7367         }
 7368 
 7369         now = mstate.dtms_timestamp = dtrace_gethrtime();
 7370         mstate.dtms_present = DTRACE_MSTATE_TIMESTAMP;
 7371         vtime = dtrace_vtime_references != 0;
 7372 
 7373         if (vtime && curthread->t_dtrace_start)
 7374                 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
 7375 
 7376         mstate.dtms_difo = NULL;
 7377         mstate.dtms_probe = probe;
 7378         mstate.dtms_strtok = 0;
 7379         mstate.dtms_arg[0] = arg0;
 7380         mstate.dtms_arg[1] = arg1;
 7381         mstate.dtms_arg[2] = arg2;
 7382         mstate.dtms_arg[3] = arg3;
 7383         mstate.dtms_arg[4] = arg4;
 7384 
 7385         flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
 7386 
 7387         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
 7388                 dtrace_predicate_t *pred = ecb->dte_predicate;
 7389                 dtrace_state_t *state = ecb->dte_state;
 7390                 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
 7391                 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
 7392                 dtrace_vstate_t *vstate = &state->dts_vstate;
 7393                 dtrace_provider_t *prov = probe->dtpr_provider;
 7394                 uint64_t tracememsize = 0;
 7395                 int committed = 0;
 7396                 caddr_t tomax;
 7397 
 7398                 /*
 7399                  * A little subtlety with the following (seemingly innocuous)
 7400                  * declaration of the automatic 'val':  by looking at the
 7401                  * code, you might think that it could be declared in the
 7402                  * action processing loop, below.  (That is, it's only used in
 7403                  * the action processing loop.)  However, it must be declared
 7404                  * out of that scope because in the case of DIF expression
 7405                  * arguments to aggregating actions, one iteration of the
 7406                  * action loop will use the last iteration's value.
 7407                  */
 7408                 uint64_t val = 0;
 7409 
 7410                 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
 7411                 mstate.dtms_getf = NULL;
 7412 
 7413                 *flags &= ~CPU_DTRACE_ERROR;
 7414 
 7415                 if (prov == dtrace_provider) {
 7416                         /*
 7417                          * If dtrace itself is the provider of this probe,
 7418                          * we're only going to continue processing the ECB if
 7419                          * arg0 (the dtrace_state_t) is equal to the ECB's
 7420                          * creating state.  (This prevents disjoint consumers
 7421                          * from seeing one another's metaprobes.)
 7422                          */
 7423                         if (arg0 != (uint64_t)(uintptr_t)state)
 7424                                 continue;
 7425                 }
 7426 
 7427                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
 7428                         /*
 7429                          * We're not currently active.  If our provider isn't
 7430                          * the dtrace pseudo provider, we're not interested.
 7431                          */
 7432                         if (prov != dtrace_provider)
 7433                                 continue;
 7434 
 7435                         /*
 7436                          * Now we must further check if we are in the BEGIN
 7437                          * probe.  If we are, we will only continue processing
 7438                          * if we're still in WARMUP -- if one BEGIN enabling
 7439                          * has invoked the exit() action, we don't want to
 7440                          * evaluate subsequent BEGIN enablings.
 7441                          */
 7442                         if (probe->dtpr_id == dtrace_probeid_begin &&
 7443                             state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
 7444                                 ASSERT(state->dts_activity ==
 7445                                     DTRACE_ACTIVITY_DRAINING);
 7446                                 continue;
 7447                         }
 7448                 }
 7449 
 7450                 if (ecb->dte_cond) {
 7451                         /*
 7452                          * If the dte_cond bits indicate that this
 7453                          * consumer is only allowed to see user-mode firings
 7454                          * of this probe, call the provider's dtps_usermode()
 7455                          * entry point to check that the probe was fired
 7456                          * while in a user context. Skip this ECB if that's
 7457                          * not the case.
 7458                          */
 7459                         if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
 7460                             prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
 7461                             probe->dtpr_id, probe->dtpr_arg) == 0)
 7462                                 continue;
 7463 
 7464 #ifdef illumos
 7465                         /*
 7466                          * This is more subtle than it looks. We have to be
 7467                          * absolutely certain that CRED() isn't going to
 7468                          * change out from under us so it's only legit to
 7469                          * examine that structure if we're in constrained
 7470                          * situations. Currently, the only time we'll do this
 7471                          * check is if a non-super-user has enabled the
 7472                          * profile or syscall providers -- providers that
 7473                          * allow visibility of all processes. For the
 7474                          * profile case, the check above will ensure that
 7475                          * we're examining a user context.
 7476                          */
 7477                         if (ecb->dte_cond & DTRACE_COND_OWNER) {
 7478                                 cred_t *cr;
 7479                                 cred_t *s_cr =
 7480                                     ecb->dte_state->dts_cred.dcr_cred;
 7481                                 proc_t *proc;
 7482 
 7483                                 ASSERT(s_cr != NULL);
 7484 
 7485                                 if ((cr = CRED()) == NULL ||
 7486                                     s_cr->cr_uid != cr->cr_uid ||
 7487                                     s_cr->cr_uid != cr->cr_ruid ||
 7488                                     s_cr->cr_uid != cr->cr_suid ||
 7489                                     s_cr->cr_gid != cr->cr_gid ||
 7490                                     s_cr->cr_gid != cr->cr_rgid ||
 7491                                     s_cr->cr_gid != cr->cr_sgid ||
 7492                                     (proc = ttoproc(curthread)) == NULL ||
 7493                                     (proc->p_flag & SNOCD))
 7494                                         continue;
 7495                         }
 7496 
 7497                         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
 7498                                 cred_t *cr;
 7499                                 cred_t *s_cr =
 7500                                     ecb->dte_state->dts_cred.dcr_cred;
 7501 
 7502                                 ASSERT(s_cr != NULL);
 7503 
 7504                                 if ((cr = CRED()) == NULL ||
 7505                                     s_cr->cr_zone->zone_id !=
 7506                                     cr->cr_zone->zone_id)
 7507                                         continue;
 7508                         }
 7509 #endif
 7510                 }
 7511 
 7512                 if (now - state->dts_alive > dtrace_deadman_timeout) {
 7513                         /*
 7514                          * We seem to be dead.  Unless we (a) have kernel
 7515                          * destructive permissions (b) have explicitly enabled
 7516                          * destructive actions and (c) destructive actions have
 7517                          * not been disabled, we're going to transition into
 7518                          * the KILLED state, from which no further processing
 7519                          * on this state will be performed.
 7520                          */
 7521                         if (!dtrace_priv_kernel_destructive(state) ||
 7522                             !state->dts_cred.dcr_destructive ||
 7523                             dtrace_destructive_disallow) {
 7524                                 void *activity = &state->dts_activity;
 7525                                 dtrace_activity_t curstate;
 7526 
 7527                                 do {
 7528                                         curstate = state->dts_activity;
 7529                                 } while (dtrace_cas32(activity, curstate,
 7530                                     DTRACE_ACTIVITY_KILLED) != curstate);
 7531 
 7532                                 continue;
 7533                         }
 7534                 }
 7535 
 7536                 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
 7537                     ecb->dte_alignment, state, &mstate)) < 0)
 7538                         continue;
 7539 
 7540                 tomax = buf->dtb_tomax;
 7541                 ASSERT(tomax != NULL);
 7542 
 7543                 if (ecb->dte_size != 0) {
 7544                         dtrace_rechdr_t dtrh;
 7545                         if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
 7546                                 mstate.dtms_timestamp = dtrace_gethrtime();
 7547                                 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
 7548                         }
 7549                         ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
 7550                         dtrh.dtrh_epid = ecb->dte_epid;
 7551                         DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
 7552                             mstate.dtms_timestamp);
 7553                         *((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
 7554                 }
 7555 
 7556                 mstate.dtms_epid = ecb->dte_epid;
 7557                 mstate.dtms_present |= DTRACE_MSTATE_EPID;
 7558 
 7559                 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
 7560                         mstate.dtms_access = DTRACE_ACCESS_KERNEL;
 7561                 else
 7562                         mstate.dtms_access = 0;
 7563 
 7564                 if (pred != NULL) {
 7565                         dtrace_difo_t *dp = pred->dtp_difo;
 7566                         uint64_t rval;
 7567 
 7568                         rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
 7569 
 7570                         if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
 7571                                 dtrace_cacheid_t cid = probe->dtpr_predcache;
 7572 
 7573                                 if (cid != DTRACE_CACHEIDNONE && !onintr) {
 7574                                         /*
 7575                                          * Update the predicate cache...
 7576                                          */
 7577                                         ASSERT(cid == pred->dtp_cacheid);
 7578                                         curthread->t_predcache = cid;
 7579                                 }
 7580 
 7581                                 continue;
 7582                         }
 7583                 }
 7584 
 7585                 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
 7586                     act != NULL; act = act->dta_next) {
 7587                         size_t valoffs;
 7588                         dtrace_difo_t *dp;
 7589                         dtrace_recdesc_t *rec = &act->dta_rec;
 7590 
 7591                         size = rec->dtrd_size;
 7592                         valoffs = offs + rec->dtrd_offset;
 7593 
 7594                         if (DTRACEACT_ISAGG(act->dta_kind)) {
 7595                                 uint64_t v = 0xbad;
 7596                                 dtrace_aggregation_t *agg;
 7597 
 7598                                 agg = (dtrace_aggregation_t *)act;
 7599 
 7600                                 if ((dp = act->dta_difo) != NULL)
 7601                                         v = dtrace_dif_emulate(dp,
 7602                                             &mstate, vstate, state);
 7603 
 7604                                 if (*flags & CPU_DTRACE_ERROR)
 7605                                         continue;
 7606 
 7607                                 /*
 7608                                  * Note that we always pass the expression
 7609                                  * value from the previous iteration of the
 7610                                  * action loop.  This value will only be used
 7611                                  * if there is an expression argument to the
 7612                                  * aggregating action, denoted by the
 7613                                  * dtag_hasarg field.
 7614                                  */
 7615                                 dtrace_aggregate(agg, buf,
 7616                                     offs, aggbuf, v, val);
 7617                                 continue;
 7618                         }
 7619 
 7620                         switch (act->dta_kind) {
 7621                         case DTRACEACT_STOP:
 7622                                 if (dtrace_priv_proc_destructive(state))
 7623                                         dtrace_action_stop();
 7624                                 continue;
 7625 
 7626                         case DTRACEACT_BREAKPOINT:
 7627                                 if (dtrace_priv_kernel_destructive(state))
 7628                                         dtrace_action_breakpoint(ecb);
 7629                                 continue;
 7630 
 7631                         case DTRACEACT_PANIC:
 7632                                 if (dtrace_priv_kernel_destructive(state))
 7633                                         dtrace_action_panic(ecb);
 7634                                 continue;
 7635 
 7636                         case DTRACEACT_STACK:
 7637                                 if (!dtrace_priv_kernel(state))
 7638                                         continue;
 7639 
 7640                                 dtrace_getpcstack((pc_t *)(tomax + valoffs),
 7641                                     size / sizeof (pc_t), probe->dtpr_aframes,
 7642                                     DTRACE_ANCHORED(probe) ? NULL :
 7643                                     (uint32_t *)arg0);
 7644                                 continue;
 7645 
 7646                         case DTRACEACT_JSTACK:
 7647                         case DTRACEACT_USTACK:
 7648                                 if (!dtrace_priv_proc(state))
 7649                                         continue;
 7650 
 7651                                 /*
 7652                                  * See comment in DIF_VAR_PID.
 7653                                  */
 7654                                 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
 7655                                     CPU_ON_INTR(CPU)) {
 7656                                         int depth = DTRACE_USTACK_NFRAMES(
 7657                                             rec->dtrd_arg) + 1;
 7658 
 7659                                         dtrace_bzero((void *)(tomax + valoffs),
 7660                                             DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
 7661                                             + depth * sizeof (uint64_t));
 7662 
 7663                                         continue;
 7664                                 }
 7665 
 7666                                 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
 7667                                     curproc->p_dtrace_helpers != NULL) {
 7668                                         /*
 7669                                          * This is the slow path -- we have
 7670                                          * allocated string space, and we're
 7671                                          * getting the stack of a process that
 7672                                          * has helpers.  Call into a separate
 7673                                          * routine to perform this processing.
 7674                                          */
 7675                                         dtrace_action_ustack(&mstate, state,
 7676                                             (uint64_t *)(tomax + valoffs),
 7677                                             rec->dtrd_arg);
 7678                                         continue;
 7679                                 }
 7680 
 7681                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 7682                                 dtrace_getupcstack((uint64_t *)
 7683                                     (tomax + valoffs),
 7684                                     DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
 7685                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
 7686                                 continue;
 7687 
 7688                         default:
 7689                                 break;
 7690                         }
 7691 
 7692                         dp = act->dta_difo;
 7693                         ASSERT(dp != NULL);
 7694 
 7695                         val = dtrace_dif_emulate(dp, &mstate, vstate, state);
 7696 
 7697                         if (*flags & CPU_DTRACE_ERROR)
 7698                                 continue;
 7699 
 7700                         switch (act->dta_kind) {
 7701                         case DTRACEACT_SPECULATE: {
 7702                                 dtrace_rechdr_t *dtrh;
 7703 
 7704                                 ASSERT(buf == &state->dts_buffer[cpuid]);
 7705                                 buf = dtrace_speculation_buffer(state,
 7706                                     cpuid, val);
 7707 
 7708                                 if (buf == NULL) {
 7709                                         *flags |= CPU_DTRACE_DROP;
 7710                                         continue;
 7711                                 }
 7712 
 7713                                 offs = dtrace_buffer_reserve(buf,
 7714                                     ecb->dte_needed, ecb->dte_alignment,
 7715                                     state, NULL);
 7716 
 7717                                 if (offs < 0) {
 7718                                         *flags |= CPU_DTRACE_DROP;
 7719                                         continue;
 7720                                 }
 7721 
 7722                                 tomax = buf->dtb_tomax;
 7723                                 ASSERT(tomax != NULL);
 7724 
 7725                                 if (ecb->dte_size == 0)
 7726                                         continue;
 7727 
 7728                                 ASSERT3U(ecb->dte_size, >=,
 7729                                     sizeof (dtrace_rechdr_t));
 7730                                 dtrh = ((void *)(tomax + offs));
 7731                                 dtrh->dtrh_epid = ecb->dte_epid;
 7732                                 /*
 7733                                  * When the speculation is committed, all of
 7734                                  * the records in the speculative buffer will
 7735                                  * have their timestamps set to the commit
 7736                                  * time.  Until then, it is set to a sentinel
 7737                                  * value, for debuggability.
 7738                                  */
 7739                                 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
 7740                                 continue;
 7741                         }
 7742 
 7743                         case DTRACEACT_PRINTM: {
 7744                                 /* The DIF returns a 'memref'. */
 7745                                 uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
 7746 
 7747                                 /* Get the size from the memref. */
 7748                                 size = memref[1];
 7749 
 7750                                 /*
 7751                                  * Check if the size exceeds the allocated
 7752                                  * buffer size.
 7753                                  */
 7754                                 if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
 7755                                         /* Flag a drop! */
 7756                                         *flags |= CPU_DTRACE_DROP;
 7757                                         continue;
 7758                                 }
 7759 
 7760                                 /* Store the size in the buffer first. */
 7761                                 DTRACE_STORE(uintptr_t, tomax,
 7762                                     valoffs, size);
 7763 
 7764                                 /*
 7765                                  * Offset the buffer address to the start
 7766                                  * of the data.
 7767                                  */
 7768                                 valoffs += sizeof(uintptr_t);
 7769 
 7770                                 /*
 7771                                  * Reset to the memory address rather than
 7772                                  * the memref array, then let the BYREF
 7773                                  * code below do the work to store the 
 7774                                  * memory data in the buffer.
 7775                                  */
 7776                                 val = memref[0];
 7777                                 break;
 7778                         }
 7779 
 7780                         case DTRACEACT_CHILL:
 7781                                 if (dtrace_priv_kernel_destructive(state))
 7782                                         dtrace_action_chill(&mstate, val);
 7783                                 continue;
 7784 
 7785                         case DTRACEACT_RAISE:
 7786                                 if (dtrace_priv_proc_destructive(state))
 7787                                         dtrace_action_raise(val);
 7788                                 continue;
 7789 
 7790                         case DTRACEACT_COMMIT:
 7791                                 ASSERT(!committed);
 7792 
 7793                                 /*
 7794                                  * We need to commit our buffer state.
 7795                                  */
 7796                                 if (ecb->dte_size)
 7797                                         buf->dtb_offset = offs + ecb->dte_size;
 7798                                 buf = &state->dts_buffer[cpuid];
 7799                                 dtrace_speculation_commit(state, cpuid, val);
 7800                                 committed = 1;
 7801                                 continue;
 7802 
 7803                         case DTRACEACT_DISCARD:
 7804                                 dtrace_speculation_discard(state, cpuid, val);
 7805                                 continue;
 7806 
 7807                         case DTRACEACT_DIFEXPR:
 7808                         case DTRACEACT_LIBACT:
 7809                         case DTRACEACT_PRINTF:
 7810                         case DTRACEACT_PRINTA:
 7811                         case DTRACEACT_SYSTEM:
 7812                         case DTRACEACT_FREOPEN:
 7813                         case DTRACEACT_TRACEMEM:
 7814                                 break;
 7815 
 7816                         case DTRACEACT_TRACEMEM_DYNSIZE:
 7817                                 tracememsize = val;
 7818                                 break;
 7819 
 7820                         case DTRACEACT_SYM:
 7821                         case DTRACEACT_MOD:
 7822                                 if (!dtrace_priv_kernel(state))
 7823                                         continue;
 7824                                 break;
 7825 
 7826                         case DTRACEACT_USYM:
 7827                         case DTRACEACT_UMOD:
 7828                         case DTRACEACT_UADDR: {
 7829 #ifdef illumos
 7830                                 struct pid *pid = curthread->t_procp->p_pidp;
 7831 #endif
 7832 
 7833                                 if (!dtrace_priv_proc(state))
 7834                                         continue;
 7835 
 7836                                 DTRACE_STORE(uint64_t, tomax,
 7837 #ifdef illumos
 7838                                     valoffs, (uint64_t)pid->pid_id);
 7839 #else
 7840                                     valoffs, (uint64_t) curproc->p_pid);
 7841 #endif
 7842                                 DTRACE_STORE(uint64_t, tomax,
 7843                                     valoffs + sizeof (uint64_t), val);
 7844 
 7845                                 continue;
 7846                         }
 7847 
 7848                         case DTRACEACT_EXIT: {
 7849                                 /*
 7850                                  * For the exit action, we are going to attempt
 7851                                  * to atomically set our activity to be
 7852                                  * draining.  If this fails (either because
 7853                                  * another CPU has beat us to the exit action,
 7854                                  * or because our current activity is something
 7855                                  * other than ACTIVE or WARMUP), we will
 7856                                  * continue.  This assures that the exit action
 7857                                  * can be successfully recorded at most once
 7858                                  * when we're in the ACTIVE state.  If we're
 7859                                  * encountering the exit() action while in
 7860                                  * COOLDOWN, however, we want to honor the new
 7861                                  * status code.  (We know that we're the only
 7862                                  * thread in COOLDOWN, so there is no race.)
 7863                                  */
 7864                                 void *activity = &state->dts_activity;
 7865                                 dtrace_activity_t curstate = state->dts_activity;
 7866 
 7867                                 if (curstate == DTRACE_ACTIVITY_COOLDOWN)
 7868                                         break;
 7869 
 7870                                 if (curstate != DTRACE_ACTIVITY_WARMUP)
 7871                                         curstate = DTRACE_ACTIVITY_ACTIVE;
 7872 
 7873                                 if (dtrace_cas32(activity, curstate,
 7874                                     DTRACE_ACTIVITY_DRAINING) != curstate) {
 7875                                         *flags |= CPU_DTRACE_DROP;
 7876                                         continue;
 7877                                 }
 7878 
 7879                                 break;
 7880                         }
 7881 
 7882                         default:
 7883                                 ASSERT(0);
 7884                         }
 7885 
 7886                         if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
 7887                             dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
 7888                                 uintptr_t end = valoffs + size;
 7889 
 7890                                 if (tracememsize != 0 &&
 7891                                     valoffs + tracememsize < end) {
 7892                                         end = valoffs + tracememsize;
 7893                                         tracememsize = 0;
 7894                                 }
 7895 
 7896                                 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
 7897                                     !dtrace_vcanload((void *)(uintptr_t)val,
 7898                                     &dp->dtdo_rtype, NULL, &mstate, vstate))
 7899                                         continue;
 7900 
 7901                                 dtrace_store_by_ref(dp, tomax, size, &valoffs,
 7902                                     &val, end, act->dta_intuple,
 7903                                     dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
 7904                                     DIF_TF_BYREF: DIF_TF_BYUREF);
 7905                                 continue;
 7906                         }
 7907 
 7908                         switch (size) {
 7909                         case 0:
 7910                                 break;
 7911 
 7912                         case sizeof (uint8_t):
 7913                                 DTRACE_STORE(uint8_t, tomax, valoffs, val);
 7914                                 break;
 7915                         case sizeof (uint16_t):
 7916                                 DTRACE_STORE(uint16_t, tomax, valoffs, val);
 7917                                 break;
 7918                         case sizeof (uint32_t):
 7919                                 DTRACE_STORE(uint32_t, tomax, valoffs, val);
 7920                                 break;
 7921                         case sizeof (uint64_t):
 7922                                 DTRACE_STORE(uint64_t, tomax, valoffs, val);
 7923                                 break;
 7924                         default:
 7925                                 /*
 7926                                  * Any other size should have been returned by
 7927                                  * reference, not by value.
 7928                                  */
 7929                                 ASSERT(0);
 7930                                 break;
 7931                         }
 7932                 }
 7933 
 7934                 if (*flags & CPU_DTRACE_DROP)
 7935                         continue;
 7936 
 7937                 if (*flags & CPU_DTRACE_FAULT) {
 7938                         int ndx;
 7939                         dtrace_action_t *err;
 7940 
 7941                         buf->dtb_errors++;
 7942 
 7943                         if (probe->dtpr_id == dtrace_probeid_error) {
 7944                                 /*
 7945                                  * There's nothing we can do -- we had an
 7946                                  * error on the error probe.  We bump an
 7947                                  * error counter to at least indicate that
 7948                                  * this condition happened.
 7949                                  */
 7950                                 dtrace_error(&state->dts_dblerrors);
 7951                                 continue;
 7952                         }
 7953 
 7954                         if (vtime) {
 7955                                 /*
 7956                                  * Before recursing on dtrace_probe(), we
 7957                                  * need to explicitly clear out our start
 7958                                  * time to prevent it from being accumulated
 7959                                  * into t_dtrace_vtime.
 7960                                  */
 7961                                 curthread->t_dtrace_start = 0;
 7962                         }
 7963 
 7964                         /*
 7965                          * Iterate over the actions to figure out which action
 7966                          * we were processing when we experienced the error.
 7967                          * Note that act points _past_ the faulting action; if
 7968                          * act is ecb->dte_action, the fault was in the
 7969                          * predicate, if it's ecb->dte_action->dta_next it's
 7970                          * in action #1, and so on.
 7971                          */
 7972                         for (err = ecb->dte_action, ndx = 0;
 7973                             err != act; err = err->dta_next, ndx++)
 7974                                 continue;
 7975 
 7976                         dtrace_probe_error(state, ecb->dte_epid, ndx,
 7977                             (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
 7978                             mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
 7979                             cpu_core[cpuid].cpuc_dtrace_illval);
 7980 
 7981                         continue;
 7982                 }
 7983 
 7984                 if (!committed)
 7985                         buf->dtb_offset = offs + ecb->dte_size;
 7986         }
 7987 
 7988         if (vtime)
 7989                 curthread->t_dtrace_start = dtrace_gethrtime();
 7990 
 7991         dtrace_probe_exit(cookie);
 7992 }
 7993 
 7994 /*
 7995  * DTrace Probe Hashing Functions
 7996  *
 7997  * The functions in this section (and indeed, the functions in remaining
 7998  * sections) are not _called_ from probe context.  (Any exceptions to this are
 7999  * marked with a "Note:".)  Rather, they are called from elsewhere in the
 8000  * DTrace framework to look-up probes in, add probes to and remove probes from
 8001  * the DTrace probe hashes.  (Each probe is hashed by each element of the
 8002  * probe tuple -- allowing for fast lookups, regardless of what was
 8003  * specified.)
 8004  */
 8005 static uint_t
 8006 dtrace_hash_str(const char *p)
 8007 {
 8008         unsigned int g;
 8009         uint_t hval = 0;
 8010 
 8011         while (*p) {
 8012                 hval = (hval << 4) + *p++;
 8013                 if ((g = (hval & 0xf0000000)) != 0)
 8014                         hval ^= g >> 24;
 8015                 hval &= ~g;
 8016         }
 8017         return (hval);
 8018 }
 8019 
 8020 static dtrace_hash_t *
 8021 dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
 8022 {
 8023         dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
 8024 
 8025         hash->dth_stroffs = stroffs;
 8026         hash->dth_nextoffs = nextoffs;
 8027         hash->dth_prevoffs = prevoffs;
 8028 
 8029         hash->dth_size = 1;
 8030         hash->dth_mask = hash->dth_size - 1;
 8031 
 8032         hash->dth_tab = kmem_zalloc(hash->dth_size *
 8033             sizeof (dtrace_hashbucket_t *), KM_SLEEP);
 8034 
 8035         return (hash);
 8036 }
 8037 
 8038 static void
 8039 dtrace_hash_destroy(dtrace_hash_t *hash)
 8040 {
 8041 #ifdef DEBUG
 8042         int i;
 8043 
 8044         for (i = 0; i < hash->dth_size; i++)
 8045                 ASSERT(hash->dth_tab[i] == NULL);
 8046 #endif
 8047 
 8048         kmem_free(hash->dth_tab,
 8049             hash->dth_size * sizeof (dtrace_hashbucket_t *));
 8050         kmem_free(hash, sizeof (dtrace_hash_t));
 8051 }
 8052 
 8053 static void
 8054 dtrace_hash_resize(dtrace_hash_t *hash)
 8055 {
 8056         int size = hash->dth_size, i, ndx;
 8057         int new_size = hash->dth_size << 1;
 8058         int new_mask = new_size - 1;
 8059         dtrace_hashbucket_t **new_tab, *bucket, *next;
 8060 
 8061         ASSERT((new_size & new_mask) == 0);
 8062 
 8063         new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
 8064 
 8065         for (i = 0; i < size; i++) {
 8066                 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
 8067                         dtrace_probe_t *probe = bucket->dthb_chain;
 8068 
 8069                         ASSERT(probe != NULL);
 8070                         ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
 8071 
 8072                         next = bucket->dthb_next;
 8073                         bucket->dthb_next = new_tab[ndx];
 8074                         new_tab[ndx] = bucket;
 8075                 }
 8076         }
 8077 
 8078         kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
 8079         hash->dth_tab = new_tab;
 8080         hash->dth_size = new_size;
 8081         hash->dth_mask = new_mask;
 8082 }
 8083 
 8084 static void
 8085 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
 8086 {
 8087         int hashval = DTRACE_HASHSTR(hash, new);
 8088         int ndx = hashval & hash->dth_mask;
 8089         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
 8090         dtrace_probe_t **nextp, **prevp;
 8091 
 8092         for (; bucket != NULL; bucket = bucket->dthb_next) {
 8093                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
 8094                         goto add;
 8095         }
 8096 
 8097         if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
 8098                 dtrace_hash_resize(hash);
 8099                 dtrace_hash_add(hash, new);
 8100                 return;
 8101         }
 8102 
 8103         bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
 8104         bucket->dthb_next = hash->dth_tab[ndx];
 8105         hash->dth_tab[ndx] = bucket;
 8106         hash->dth_nbuckets++;
 8107 
 8108 add:
 8109         nextp = DTRACE_HASHNEXT(hash, new);
 8110         ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
 8111         *nextp = bucket->dthb_chain;
 8112 
 8113         if (bucket->dthb_chain != NULL) {
 8114                 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
 8115                 ASSERT(*prevp == NULL);
 8116                 *prevp = new;
 8117         }
 8118 
 8119         bucket->dthb_chain = new;
 8120         bucket->dthb_len++;
 8121 }
 8122 
 8123 static dtrace_probe_t *
 8124 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
 8125 {
 8126         int hashval = DTRACE_HASHSTR(hash, template);
 8127         int ndx = hashval & hash->dth_mask;
 8128         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
 8129 
 8130         for (; bucket != NULL; bucket = bucket->dthb_next) {
 8131                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
 8132                         return (bucket->dthb_chain);
 8133         }
 8134 
 8135         return (NULL);
 8136 }
 8137 
 8138 static int
 8139 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
 8140 {
 8141         int hashval = DTRACE_HASHSTR(hash, template);
 8142         int ndx = hashval & hash->dth_mask;
 8143         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
 8144 
 8145         for (; bucket != NULL; bucket = bucket->dthb_next) {
 8146                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
 8147                         return (bucket->dthb_len);
 8148         }
 8149 
 8150         return (0);
 8151 }
 8152 
 8153 static void
 8154 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
 8155 {
 8156         int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
 8157         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
 8158 
 8159         dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
 8160         dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
 8161 
 8162         /*
 8163          * Find the bucket that we're removing this probe from.
 8164          */
 8165         for (; bucket != NULL; bucket = bucket->dthb_next) {
 8166                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
 8167                         break;
 8168         }
 8169 
 8170         ASSERT(bucket != NULL);
 8171 
 8172         if (*prevp == NULL) {
 8173                 if (*nextp == NULL) {
 8174                         /*
 8175                          * The removed probe was the only probe on this
 8176                          * bucket; we need to remove the bucket.
 8177                          */
 8178                         dtrace_hashbucket_t *b = hash->dth_tab[ndx];
 8179 
 8180                         ASSERT(bucket->dthb_chain == probe);
 8181                         ASSERT(b != NULL);
 8182 
 8183                         if (b == bucket) {
 8184                                 hash->dth_tab[ndx] = bucket->dthb_next;
 8185                         } else {
 8186                                 while (b->dthb_next != bucket)
 8187                                         b = b->dthb_next;
 8188                                 b->dthb_next = bucket->dthb_next;
 8189                         }
 8190 
 8191                         ASSERT(hash->dth_nbuckets > 0);
 8192                         hash->dth_nbuckets--;
 8193                         kmem_free(bucket, sizeof (dtrace_hashbucket_t));
 8194                         return;
 8195                 }
 8196 
 8197                 bucket->dthb_chain = *nextp;
 8198         } else {
 8199                 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
 8200         }
 8201 
 8202         if (*nextp != NULL)
 8203                 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
 8204 }
 8205 
 8206 /*
 8207  * DTrace Utility Functions
 8208  *
 8209  * These are random utility functions that are _not_ called from probe context.
 8210  */
 8211 static int
 8212 dtrace_badattr(const dtrace_attribute_t *a)
 8213 {
 8214         return (a->dtat_name > DTRACE_STABILITY_MAX ||
 8215             a->dtat_data > DTRACE_STABILITY_MAX ||
 8216             a->dtat_class > DTRACE_CLASS_MAX);
 8217 }
 8218 
 8219 /*
 8220  * Return a duplicate copy of a string.  If the specified string is NULL,
 8221  * this function returns a zero-length string.
 8222  */
 8223 static char *
 8224 dtrace_strdup(const char *str)
 8225 {
 8226         char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
 8227 
 8228         if (str != NULL)
 8229                 (void) strcpy(new, str);
 8230 
 8231         return (new);
 8232 }
 8233 
 8234 #define DTRACE_ISALPHA(c)       \
 8235         (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
 8236 
 8237 static int
 8238 dtrace_badname(const char *s)
 8239 {
 8240         char c;
 8241 
 8242         if (s == NULL || (c = *s++) == '\0')
 8243                 return (0);
 8244 
 8245         if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
 8246                 return (1);
 8247 
 8248         while ((c = *s++) != '\0') {
 8249                 if (!DTRACE_ISALPHA(c) && (c < '' || c > '9') &&
 8250                     c != '-' && c != '_' && c != '.' && c != '`')
 8251                         return (1);
 8252         }
 8253 
 8254         return (0);
 8255 }
 8256 
 8257 static void
 8258 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
 8259 {
 8260         uint32_t priv;
 8261 
 8262 #ifdef illumos
 8263         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
 8264                 /*
 8265                  * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
 8266                  */
 8267                 priv = DTRACE_PRIV_ALL;
 8268         } else {
 8269                 *uidp = crgetuid(cr);
 8270                 *zoneidp = crgetzoneid(cr);
 8271 
 8272                 priv = 0;
 8273                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
 8274                         priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
 8275                 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
 8276                         priv |= DTRACE_PRIV_USER;
 8277                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
 8278                         priv |= DTRACE_PRIV_PROC;
 8279                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
 8280                         priv |= DTRACE_PRIV_OWNER;
 8281                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
 8282                         priv |= DTRACE_PRIV_ZONEOWNER;
 8283         }
 8284 #else
 8285         priv = DTRACE_PRIV_ALL;
 8286 #endif
 8287 
 8288         *privp = priv;
 8289 }
 8290 
 8291 #ifdef DTRACE_ERRDEBUG
 8292 static void
 8293 dtrace_errdebug(const char *str)
 8294 {
 8295         int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
 8296         int occupied = 0;
 8297 
 8298         mutex_enter(&dtrace_errlock);
 8299         dtrace_errlast = str;
 8300         dtrace_errthread = curthread;
 8301 
 8302         while (occupied++ < DTRACE_ERRHASHSZ) {
 8303                 if (dtrace_errhash[hval].dter_msg == str) {
 8304                         dtrace_errhash[hval].dter_count++;
 8305                         goto out;
 8306                 }
 8307 
 8308                 if (dtrace_errhash[hval].dter_msg != NULL) {
 8309                         hval = (hval + 1) % DTRACE_ERRHASHSZ;
 8310                         continue;
 8311                 }
 8312 
 8313                 dtrace_errhash[hval].dter_msg = str;
 8314                 dtrace_errhash[hval].dter_count = 1;
 8315                 goto out;
 8316         }
 8317 
 8318         panic("dtrace: undersized error hash");
 8319 out:
 8320         mutex_exit(&dtrace_errlock);
 8321 }
 8322 #endif
 8323 
 8324 /*
 8325  * DTrace Matching Functions
 8326  *
 8327  * These functions are used to match groups of probes, given some elements of
 8328  * a probe tuple, or some globbed expressions for elements of a probe tuple.
 8329  */
 8330 static int
 8331 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
 8332     zoneid_t zoneid)
 8333 {
 8334         if (priv != DTRACE_PRIV_ALL) {
 8335                 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
 8336                 uint32_t match = priv & ppriv;
 8337 
 8338                 /*
 8339                  * No PRIV_DTRACE_* privileges...
 8340                  */
 8341                 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
 8342                     DTRACE_PRIV_KERNEL)) == 0)
 8343                         return (0);
 8344 
 8345                 /*
 8346                  * No matching bits, but there were bits to match...
 8347                  */
 8348                 if (match == 0 && ppriv != 0)
 8349                         return (0);
 8350 
 8351                 /*
 8352                  * Need to have permissions to the process, but don't...
 8353                  */
 8354                 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
 8355                     uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
 8356                         return (0);
 8357                 }
 8358 
 8359                 /*
 8360                  * Need to be in the same zone unless we possess the
 8361                  * privilege to examine all zones.
 8362                  */
 8363                 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
 8364                     zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
 8365                         return (0);
 8366                 }
 8367         }
 8368 
 8369         return (1);
 8370 }
 8371 
 8372 /*
 8373  * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
 8374  * consists of input pattern strings and an ops-vector to evaluate them.
 8375  * This function returns >0 for match, 0 for no match, and <0 for error.
 8376  */
 8377 static int
 8378 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
 8379     uint32_t priv, uid_t uid, zoneid_t zoneid)
 8380 {
 8381         dtrace_provider_t *pvp = prp->dtpr_provider;
 8382         int rv;
 8383 
 8384         if (pvp->dtpv_defunct)
 8385                 return (0);
 8386 
 8387         if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
 8388                 return (rv);
 8389 
 8390         if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
 8391                 return (rv);
 8392 
 8393         if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
 8394                 return (rv);
 8395 
 8396         if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
 8397                 return (rv);
 8398 
 8399         if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
 8400                 return (0);
 8401 
 8402         return (rv);
 8403 }
 8404 
 8405 /*
 8406  * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
 8407  * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
 8408  * libc's version, the kernel version only applies to 8-bit ASCII strings.
 8409  * In addition, all of the recursion cases except for '*' matching have been
 8410  * unwound.  For '*', we still implement recursive evaluation, but a depth
 8411  * counter is maintained and matching is aborted if we recurse too deep.
 8412  * The function returns 0 if no match, >0 if match, and <0 if recursion error.
 8413  */
 8414 static int
 8415 dtrace_match_glob(const char *s, const char *p, int depth)
 8416 {
 8417         const char *olds;
 8418         char s1, c;
 8419         int gs;
 8420 
 8421         if (depth > DTRACE_PROBEKEY_MAXDEPTH)
 8422                 return (-1);
 8423 
 8424         if (s == NULL)
 8425                 s = ""; /* treat NULL as empty string */
 8426 
 8427 top:
 8428         olds = s;
 8429         s1 = *s++;
 8430 
 8431         if (p == NULL)
 8432                 return (0);
 8433 
 8434         if ((c = *p++) == '\0')
 8435                 return (s1 == '\0');
 8436 
 8437         switch (c) {
 8438         case '[': {
 8439                 int ok = 0, notflag = 0;
 8440                 char lc = '\0';
 8441 
 8442                 if (s1 == '\0')
 8443                         return (0);
 8444 
 8445                 if (*p == '!') {
 8446                         notflag = 1;
 8447                         p++;
 8448                 }
 8449 
 8450                 if ((c = *p++) == '\0')
 8451                         return (0);
 8452 
 8453                 do {
 8454                         if (c == '-' && lc != '\0' && *p != ']') {
 8455                                 if ((c = *p++) == '\0')
 8456                                         return (0);
 8457                                 if (c == '\\' && (c = *p++) == '\0')
 8458                                         return (0);
 8459 
 8460                                 if (notflag) {
 8461                                         if (s1 < lc || s1 > c)
 8462                                                 ok++;
 8463                                         else
 8464                                                 return (0);
 8465                                 } else if (lc <= s1 && s1 <= c)
 8466                                         ok++;
 8467 
 8468                         } else if (c == '\\' && (c = *p++) == '\0')
 8469                                 return (0);
 8470 
 8471                         lc = c; /* save left-hand 'c' for next iteration */
 8472 
 8473                         if (notflag) {
 8474                                 if (s1 != c)
 8475                                         ok++;
 8476                                 else
 8477                                         return (0);
 8478                         } else if (s1 == c)
 8479                                 ok++;
 8480 
 8481                         if ((c = *p++) == '\0')
 8482                                 return (0);
 8483 
 8484                 } while (c != ']');
 8485 
 8486                 if (ok)
 8487                         goto top;
 8488 
 8489                 return (0);
 8490         }
 8491 
 8492         case '\\':
 8493                 if ((c = *p++) == '\0')
 8494                         return (0);
 8495                 /*FALLTHRU*/
 8496 
 8497         default:
 8498                 if (c != s1)
 8499                         return (0);
 8500                 /*FALLTHRU*/
 8501 
 8502         case '?':
 8503                 if (s1 != '\0')
 8504                         goto top;
 8505                 return (0);
 8506 
 8507         case '*':
 8508                 while (*p == '*')
 8509                         p++; /* consecutive *'s are identical to a single one */
 8510 
 8511                 if (*p == '\0')
 8512                         return (1);
 8513 
 8514                 for (s = olds; *s != '\0'; s++) {
 8515                         if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
 8516                                 return (gs);
 8517                 }
 8518 
 8519                 return (0);
 8520         }
 8521 }
 8522 
 8523 /*ARGSUSED*/
 8524 static int
 8525 dtrace_match_string(const char *s, const char *p, int depth)
 8526 {
 8527         return (s != NULL && strcmp(s, p) == 0);
 8528 }
 8529 
 8530 /*ARGSUSED*/
 8531 static int
 8532 dtrace_match_nul(const char *s, const char *p, int depth)
 8533 {
 8534         return (1); /* always match the empty pattern */
 8535 }
 8536 
 8537 /*ARGSUSED*/
 8538 static int
 8539 dtrace_match_nonzero(const char *s, const char *p, int depth)
 8540 {
 8541         return (s != NULL && s[0] != '\0');
 8542 }
 8543 
 8544 static int
 8545 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
 8546     zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
 8547 {
 8548         dtrace_probe_t template, *probe;
 8549         dtrace_hash_t *hash = NULL;
 8550         int len, best = INT_MAX, nmatched = 0;
 8551         dtrace_id_t i;
 8552 
 8553         ASSERT(MUTEX_HELD(&dtrace_lock));
 8554 
 8555         /*
 8556          * If the probe ID is specified in the key, just lookup by ID and
 8557          * invoke the match callback once if a matching probe is found.
 8558          */
 8559         if (pkp->dtpk_id != DTRACE_IDNONE) {
 8560                 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
 8561                     dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
 8562                         (void) (*matched)(probe, arg);
 8563                         nmatched++;
 8564                 }
 8565                 return (nmatched);
 8566         }
 8567 
 8568         template.dtpr_mod = (char *)pkp->dtpk_mod;
 8569         template.dtpr_func = (char *)pkp->dtpk_func;
 8570         template.dtpr_name = (char *)pkp->dtpk_name;
 8571 
 8572         /*
 8573          * We want to find the most distinct of the module name, function
 8574          * name, and name.  So for each one that is not a glob pattern or
 8575          * empty string, we perform a lookup in the corresponding hash and
 8576          * use the hash table with the fewest collisions to do our search.
 8577          */
 8578         if (pkp->dtpk_mmatch == &dtrace_match_string &&
 8579             (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
 8580                 best = len;
 8581                 hash = dtrace_bymod;
 8582         }
 8583 
 8584         if (pkp->dtpk_fmatch == &dtrace_match_string &&
 8585             (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
 8586                 best = len;
 8587                 hash = dtrace_byfunc;
 8588         }
 8589 
 8590         if (pkp->dtpk_nmatch == &dtrace_match_string &&
 8591             (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
 8592                 best = len;
 8593                 hash = dtrace_byname;
 8594         }
 8595 
 8596         /*
 8597          * If we did not select a hash table, iterate over every probe and
 8598          * invoke our callback for each one that matches our input probe key.
 8599          */
 8600         if (hash == NULL) {
 8601                 for (i = 0; i < dtrace_nprobes; i++) {
 8602                         if ((probe = dtrace_probes[i]) == NULL ||
 8603                             dtrace_match_probe(probe, pkp, priv, uid,
 8604                             zoneid) <= 0)
 8605                                 continue;
 8606 
 8607                         nmatched++;
 8608 
 8609                         if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
 8610                                 break;
 8611                 }
 8612 
 8613                 return (nmatched);
 8614         }
 8615 
 8616         /*
 8617          * If we selected a hash table, iterate over each probe of the same key
 8618          * name and invoke the callback for every probe that matches the other
 8619          * attributes of our input probe key.
 8620          */
 8621         for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
 8622             probe = *(DTRACE_HASHNEXT(hash, probe))) {
 8623 
 8624                 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
 8625                         continue;
 8626 
 8627                 nmatched++;
 8628 
 8629                 if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
 8630                         break;
 8631         }
 8632 
 8633         return (nmatched);
 8634 }
 8635 
 8636 /*
 8637  * Return the function pointer dtrace_probecmp() should use to compare the
 8638  * specified pattern with a string.  For NULL or empty patterns, we select
 8639  * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
 8640  * For non-empty non-glob strings, we use dtrace_match_string().
 8641  */
 8642 static dtrace_probekey_f *
 8643 dtrace_probekey_func(const char *p)
 8644 {
 8645         char c;
 8646 
 8647         if (p == NULL || *p == '\0')
 8648                 return (&dtrace_match_nul);
 8649 
 8650         while ((c = *p++) != '\0') {
 8651                 if (c == '[' || c == '?' || c == '*' || c == '\\')
 8652                         return (&dtrace_match_glob);
 8653         }
 8654 
 8655         return (&dtrace_match_string);
 8656 }
 8657 
 8658 /*
 8659  * Build a probe comparison key for use with dtrace_match_probe() from the
 8660  * given probe description.  By convention, a null key only matches anchored
 8661  * probes: if each field is the empty string, reset dtpk_fmatch to
 8662  * dtrace_match_nonzero().
 8663  */
 8664 static void
 8665 dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
 8666 {
 8667         pkp->dtpk_prov = pdp->dtpd_provider;
 8668         pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
 8669 
 8670         pkp->dtpk_mod = pdp->dtpd_mod;
 8671         pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
 8672 
 8673         pkp->dtpk_func = pdp->dtpd_func;
 8674         pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
 8675 
 8676         pkp->dtpk_name = pdp->dtpd_name;
 8677         pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
 8678 
 8679         pkp->dtpk_id = pdp->dtpd_id;
 8680 
 8681         if (pkp->dtpk_id == DTRACE_IDNONE &&
 8682             pkp->dtpk_pmatch == &dtrace_match_nul &&
 8683             pkp->dtpk_mmatch == &dtrace_match_nul &&
 8684             pkp->dtpk_fmatch == &dtrace_match_nul &&
 8685             pkp->dtpk_nmatch == &dtrace_match_nul)
 8686                 pkp->dtpk_fmatch = &dtrace_match_nonzero;
 8687 }
 8688 
 8689 /*
 8690  * DTrace Provider-to-Framework API Functions
 8691  *
 8692  * These functions implement much of the Provider-to-Framework API, as
 8693  * described in <sys/dtrace.h>.  The parts of the API not in this section are
 8694  * the functions in the API for probe management (found below), and
 8695  * dtrace_probe() itself (found above).
 8696  */
 8697 
 8698 /*
 8699  * Register the calling provider with the DTrace framework.  This should
 8700  * generally be called by DTrace providers in their attach(9E) entry point.
 8701  */
 8702 int
 8703 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
 8704     cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
 8705 {
 8706         dtrace_provider_t *provider;
 8707 
 8708         if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
 8709                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
 8710                     "arguments", name ? name : "<NULL>");
 8711                 return (EINVAL);
 8712         }
 8713 
 8714         if (name[0] == '\0' || dtrace_badname(name)) {
 8715                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
 8716                     "provider name", name);
 8717                 return (EINVAL);
 8718         }
 8719 
 8720         if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
 8721             pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
 8722             pops->dtps_destroy == NULL ||
 8723             ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
 8724                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
 8725                     "provider ops", name);
 8726                 return (EINVAL);
 8727         }
 8728 
 8729         if (dtrace_badattr(&pap->dtpa_provider) ||
 8730             dtrace_badattr(&pap->dtpa_mod) ||
 8731             dtrace_badattr(&pap->dtpa_func) ||
 8732             dtrace_badattr(&pap->dtpa_name) ||
 8733             dtrace_badattr(&pap->dtpa_args)) {
 8734                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
 8735                     "provider attributes", name);
 8736                 return (EINVAL);
 8737         }
 8738 
 8739         if (priv & ~DTRACE_PRIV_ALL) {
 8740                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
 8741                     "privilege attributes", name);
 8742                 return (EINVAL);
 8743         }
 8744 
 8745         if ((priv & DTRACE_PRIV_KERNEL) &&
 8746             (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
 8747             pops->dtps_usermode == NULL) {
 8748                 cmn_err(CE_WARN, "failed to register provider '%s': need "
 8749                     "dtps_usermode() op for given privilege attributes", name);
 8750                 return (EINVAL);
 8751         }
 8752 
 8753         provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
 8754         provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
 8755         (void) strcpy(provider->dtpv_name, name);
 8756 
 8757         provider->dtpv_attr = *pap;
 8758         provider->dtpv_priv.dtpp_flags = priv;
 8759         if (cr != NULL) {
 8760                 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
 8761                 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
 8762         }
 8763         provider->dtpv_pops = *pops;
 8764 
 8765         if (pops->dtps_provide == NULL) {
 8766                 ASSERT(pops->dtps_provide_module != NULL);
 8767                 provider->dtpv_pops.dtps_provide =
 8768                     (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
 8769         }
 8770 
 8771         if (pops->dtps_provide_module == NULL) {
 8772                 ASSERT(pops->dtps_provide != NULL);
 8773                 provider->dtpv_pops.dtps_provide_module =
 8774                     (void (*)(void *, modctl_t *))dtrace_nullop;
 8775         }
 8776 
 8777         if (pops->dtps_suspend == NULL) {
 8778                 ASSERT(pops->dtps_resume == NULL);
 8779                 provider->dtpv_pops.dtps_suspend =
 8780                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
 8781                 provider->dtpv_pops.dtps_resume =
 8782                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
 8783         }
 8784 
 8785         provider->dtpv_arg = arg;
 8786         *idp = (dtrace_provider_id_t)provider;
 8787 
 8788         if (pops == &dtrace_provider_ops) {
 8789                 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
 8790                 ASSERT(MUTEX_HELD(&dtrace_lock));
 8791                 ASSERT(dtrace_anon.dta_enabling == NULL);
 8792 
 8793                 /*
 8794                  * We make sure that the DTrace provider is at the head of
 8795                  * the provider chain.
 8796                  */
 8797                 provider->dtpv_next = dtrace_provider;
 8798                 dtrace_provider = provider;
 8799                 return (0);
 8800         }
 8801 
 8802         mutex_enter(&dtrace_provider_lock);
 8803         mutex_enter(&dtrace_lock);
 8804 
 8805         /*
 8806          * If there is at least one provider registered, we'll add this
 8807          * provider after the first provider.
 8808          */
 8809         if (dtrace_provider != NULL) {
 8810                 provider->dtpv_next = dtrace_provider->dtpv_next;
 8811                 dtrace_provider->dtpv_next = provider;
 8812         } else {
 8813                 dtrace_provider = provider;
 8814         }
 8815 
 8816         if (dtrace_retained != NULL) {
 8817                 dtrace_enabling_provide(provider);
 8818 
 8819                 /*
 8820                  * Now we need to call dtrace_enabling_matchall() -- which
 8821                  * will acquire cpu_lock and dtrace_lock.  We therefore need
 8822                  * to drop all of our locks before calling into it...
 8823                  */
 8824                 mutex_exit(&dtrace_lock);
 8825                 mutex_exit(&dtrace_provider_lock);
 8826                 dtrace_enabling_matchall();
 8827 
 8828                 return (0);
 8829         }
 8830 
 8831         mutex_exit(&dtrace_lock);
 8832         mutex_exit(&dtrace_provider_lock);
 8833 
 8834         return (0);
 8835 }
 8836 
/*
 * Unregister the specified provider from the DTrace framework.  This should
 * generally be called by DTrace providers in their detach(9E) entry point.
 *
 * Return values:
 *   0      - the provider's probes were destroyed and the provider freed.
 *   EBUSY  - DTrace itself is unregistering while another provider is still
 *            on the chain; or the provider is not defunct and either
 *            dtrace_opens is non-zero or the anonymous state has ECBs; or a
 *            probe is still enabled and more than
 *            dtrace_unregister_defunct_reap has elapsed since dtpv_defunct.
 *   EAGAIN - a probe is still enabled, but within the reap interval; an
 *            asynchronous dtrace_enabling_reap() has been dispatched and the
 *            caller should retry later.
 *
 * Panics if the provider is not found on the provider chain.
 */
int
dtrace_unregister(dtrace_provider_id_t id)
{
        dtrace_provider_t *old = (dtrace_provider_t *)id;
        dtrace_provider_t *prev = NULL;
        /*
         * self:   non-zero when DTrace itself is the provider (locks are
         *         held by the caller and must not be touched here).
         * noreap: non-zero when the defunct-reap interval has expired.
         */
        int i, self = 0, noreap = 0;
        /* first: head of the list of probes unlinked for destruction. */
        dtrace_probe_t *probe, *first = NULL;

        if (old->dtpv_pops.dtps_enable ==
            (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
                /*
                 * If DTrace itself is the provider, we're called with locks
                 * already held.
                 */
                ASSERT(old == dtrace_provider);
#ifdef illumos
                ASSERT(dtrace_devi != NULL);
#endif
                ASSERT(MUTEX_HELD(&dtrace_provider_lock));
                ASSERT(MUTEX_HELD(&dtrace_lock));
                self = 1;

                if (dtrace_provider->dtpv_next != NULL) {
                        /*
                         * There's another provider here; return failure.
                         */
                        return (EBUSY);
                }
        } else {
                /*
                 * Lock order: dtrace_provider_lock, then (illumos only)
                 * mod_lock, then dtrace_lock.  Every exit path below
                 * releases them in the reverse order.
                 */
                mutex_enter(&dtrace_provider_lock);
#ifdef illumos
                mutex_enter(&mod_lock);
#endif
                mutex_enter(&dtrace_lock);
        }

        /*
         * If anyone has /dev/dtrace open, or if there are anonymous enabled
         * probes, we refuse to let providers slither away, unless this
         * provider has already been explicitly invalidated.
         */
        if (!old->dtpv_defunct &&
            (dtrace_opens || (dtrace_anon.dta_state != NULL &&
            dtrace_anon.dta_state->dts_necbs > 0))) {
                if (!self) {
                        mutex_exit(&dtrace_lock);
#ifdef illumos
                        mutex_exit(&mod_lock);
#endif
                        mutex_exit(&dtrace_provider_lock);
                }
                return (EBUSY);
        }

        /*
         * Attempt to destroy the probes associated with this provider.
         * This first pass only looks for a probe of ours that still has an
         * ECB; the first one found ends the attempt, and nothing has been
         * modified at that point.
         */
        for (i = 0; i < dtrace_nprobes; i++) {
                if ((probe = dtrace_probes[i]) == NULL)
                        continue;

                if (probe->dtpr_provider != old)
                        continue;

                if (probe->dtpr_ecb == NULL)
                        continue;

                /*
                 * If we are trying to unregister a defunct provider, and the
                 * provider was made defunct within the interval dictated by
                 * dtrace_unregister_defunct_reap, we'll (asynchronously)
                 * attempt to reap our enablings.  To denote that the provider
                 * should reattempt to unregister itself at some point in the
                 * future, we will return a differentiable error code (EAGAIN
                 * instead of EBUSY) in this case.
                 */
                if (dtrace_gethrtime() - old->dtpv_defunct >
                    dtrace_unregister_defunct_reap)
                        noreap = 1;

                if (!self) {
                        mutex_exit(&dtrace_lock);
#ifdef illumos
                        mutex_exit(&mod_lock);
#endif
                        mutex_exit(&dtrace_provider_lock);
                }

                if (noreap)
                        return (EBUSY);

                /* The reap is dispatched only after the locks are dropped. */
                (void) taskq_dispatch(dtrace_taskq,
                    (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);

                return (EAGAIN);
        }

        /*
         * All of the probes for this provider are disabled; we can safely
         * remove all of them from their hash chains and from the probe array.
         */
        for (i = 0; i < dtrace_nprobes; i++) {
                if ((probe = dtrace_probes[i]) == NULL)
                        continue;

                if (probe->dtpr_provider != old)
                        continue;

                dtrace_probes[i] = NULL;

                dtrace_hash_remove(dtrace_bymod, probe);
                dtrace_hash_remove(dtrace_byfunc, probe);
                dtrace_hash_remove(dtrace_byname, probe);

                /*
                 * Push the probe onto the private "first" list, reusing its
                 * dtpr_nextmod link now that it is off the hash chains.
                 */
                if (first == NULL) {
                        first = probe;
                        probe->dtpr_nextmod = NULL;
                } else {
                        probe->dtpr_nextmod = first;
                        first = probe;
                }
        }

        /*
         * The provider's probes have been removed from the hash chains and
         * from the probe array.  Now issue a dtrace_sync() to be sure that
         * everyone has cleared out from any probe array processing.
         */
        dtrace_sync();

        /*
         * Destroy each unlinked probe: notify the provider, free the three
         * name strings, return the probe ID to the arena, and free the probe
         * itself.  "first" is advanced before the probe is freed.
         */
        for (probe = first; probe != NULL; probe = first) {
                first = probe->dtpr_nextmod;

                old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
                    probe->dtpr_arg);
                kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
                kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
                kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
#ifdef illumos
                vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
#else
                free_unr(dtrace_arena, probe->dtpr_id);
#endif
                kmem_free(probe, sizeof (dtrace_probe_t));
        }

        /* Unlink the provider from the singly-linked provider chain. */
        if ((prev = dtrace_provider) == old) {
#ifdef illumos
                ASSERT(self || dtrace_devi == NULL);
                ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
#endif
                dtrace_provider = old->dtpv_next;
        } else {
                while (prev != NULL && prev->dtpv_next != old)
                        prev = prev->dtpv_next;

                if (prev == NULL) {
                        panic("attempt to unregister non-existent "
                            "dtrace provider %p\n", (void *)id);
                }

                prev->dtpv_next = old->dtpv_next;
        }

        if (!self) {
                mutex_exit(&dtrace_lock);
#ifdef illumos
                mutex_exit(&mod_lock);
#endif
                mutex_exit(&dtrace_provider_lock);
        }

        /* The provider is off the chain; release its name and itself. */
        kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
        kmem_free(old, sizeof (dtrace_provider_t));

        return (0);
}
 9018 
 9019 /*
 9020  * Invalidate the specified provider.  All subsequent probe lookups for the
 9021  * specified provider will fail, but its probes will not be removed.
 9022  */
 9023 void
 9024 dtrace_invalidate(dtrace_provider_id_t id)
 9025 {
 9026         dtrace_provider_t *pvp = (dtrace_provider_t *)id;
 9027 
 9028         ASSERT(pvp->dtpv_pops.dtps_enable !=
 9029             (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
 9030 
 9031         mutex_enter(&dtrace_provider_lock);
 9032         mutex_enter(&dtrace_lock);
 9033 
 9034         pvp->dtpv_defunct = dtrace_gethrtime();
 9035 
 9036         mutex_exit(&dtrace_lock);
 9037         mutex_exit(&dtrace_provider_lock);
 9038 }
 9039 
 9040 /*
 9041  * Indicate whether or not DTrace has attached.
 9042  */
 9043 int
 9044 dtrace_attached(void)
 9045 {
 9046         /*
 9047          * dtrace_provider will be non-NULL iff the DTrace driver has
 9048          * attached.  (It's non-NULL because DTrace is always itself a
 9049          * provider.)
 9050          */
 9051         return (dtrace_provider != NULL);
 9052 }
 9053 
 9054 /*
 9055  * Remove all the unenabled probes for the given provider.  This function is
 9056  * not unlike dtrace_unregister(), except that it doesn't remove the provider
 9057  * -- just as many of its associated probes as it can.
 9058  */
 9059 int
 9060 dtrace_condense(dtrace_provider_id_t id)
 9061 {
 9062         dtrace_provider_t *prov = (dtrace_provider_t *)id;
 9063         int i;
 9064         dtrace_probe_t *probe;
 9065 
 9066         /*
 9067          * Make sure this isn't the dtrace provider itself.
 9068          */
 9069         ASSERT(prov->dtpv_pops.dtps_enable !=
 9070             (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
 9071 
 9072         mutex_enter(&dtrace_provider_lock);
 9073         mutex_enter(&dtrace_lock);
 9074 
 9075         /*
 9076          * Attempt to destroy the probes associated with this provider.
 9077          */
 9078         for (i = 0; i < dtrace_nprobes; i++) {
 9079                 if ((probe = dtrace_probes[i]) == NULL)
 9080                         continue;
 9081 
 9082                 if (probe->dtpr_provider != prov)
 9083                         continue;
 9084 
 9085                 if (probe->dtpr_ecb != NULL)
 9086                         continue;
 9087 
 9088                 dtrace_probes[i] = NULL;
 9089 
 9090                 dtrace_hash_remove(dtrace_bymod, probe);
 9091                 dtrace_hash_remove(dtrace_byfunc, probe);
 9092                 dtrace_hash_remove(dtrace_byname, probe);
 9093 
 9094                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
 9095                     probe->dtpr_arg);
 9096                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
 9097                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
 9098                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
 9099                 kmem_free(probe, sizeof (dtrace_probe_t));
 9100 #ifdef illumos
 9101                 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
 9102 #else
 9103                 free_unr(dtrace_arena, i + 1);
 9104 #endif
 9105         }
 9106 
 9107         mutex_exit(&dtrace_lock);
 9108         mutex_exit(&dtrace_provider_lock);
 9109 
 9110         return (0);
 9111 }
 9112 
 9113 /*
 9114  * DTrace Probe Management Functions
 9115  *
 9116  * The functions in this section perform the DTrace probe management,
 9117  * including functions to create probes, look-up probes, and call into the
 9118  * providers to request that probes be provided.  Some of these functions are
 9119  * in the Provider-to-Framework API; these functions can be identified by the
 9120  * fact that they are not declared "static".
 9121  */
 9122 
/*
 * Create a probe with the specified module name, function name, and name.
 *
 * prov is the owning provider, aframes and arg are stored in the new probe
 * as dtpr_aframes and dtpr_arg, and the three name strings are duplicated
 * with dtrace_strdup().  The probe is entered into the by-module,
 * by-function and by-name hashes and into dtrace_probes[] at index id - 1.
 *
 * If prov is the DTrace provider itself, the caller must already hold
 * dtrace_lock; for any other provider the lock is taken and dropped here.
 *
 * Returns the newly allocated probe ID.
 */
dtrace_id_t
dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
    const char *func, const char *name, int aframes, void *arg)
{
        dtrace_probe_t *probe, **probes;
        dtrace_provider_t *provider = (dtrace_provider_t *)prov;
        dtrace_id_t id;

        if (provider == dtrace_provider) {
                ASSERT(MUTEX_HELD(&dtrace_lock));
        } else {
                mutex_enter(&dtrace_lock);
        }

        /* Allocate a fresh probe ID from the ID arena. */
#ifdef illumos
        id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
            VM_BESTFIT | VM_SLEEP);
#else
        id = alloc_unr(dtrace_arena);
#endif
        probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);

        probe->dtpr_id = id;
        probe->dtpr_gen = dtrace_probegen++;
        probe->dtpr_mod = dtrace_strdup(mod);
        probe->dtpr_func = dtrace_strdup(func);
        probe->dtpr_name = dtrace_strdup(name);
        probe->dtpr_arg = arg;
        probe->dtpr_aframes = aframes;
        probe->dtpr_provider = provider;

        dtrace_hash_add(dtrace_bymod, probe);
        dtrace_hash_add(dtrace_byfunc, probe);
        dtrace_hash_add(dtrace_byname, probe);

        /*
         * Probe IDs are 1-based while the probe array is 0-based.  If the
         * new ID does not fit, grow the array: to a single slot when it is
         * empty, otherwise to twice its current size.
         */
        if (id - 1 >= dtrace_nprobes) {
                size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
                size_t nsize = osize << 1;

                if (nsize == 0) {
                        ASSERT(osize == 0);
                        ASSERT(dtrace_probes == NULL);
                        nsize = sizeof (dtrace_probe_t *);
                }

                probes = kmem_zalloc(nsize, KM_SLEEP);

                if (dtrace_probes == NULL) {
                        ASSERT(osize == 0);
                        dtrace_probes = probes;
                        dtrace_nprobes = 1;
                } else {
                        dtrace_probe_t **oprobes = dtrace_probes;

                        /*
                         * Copy the old contents and only then switch the
                         * global pointer.  NOTE(review): the barrier
                         * presumably orders the copy before the pointer
                         * store -- confirm against dtrace_membar_producer().
                         */
                        bcopy(oprobes, probes, osize);
                        dtrace_membar_producer();
                        dtrace_probes = probes;

                        dtrace_sync();

                        /*
                         * All CPUs are now seeing the new probes array; we can
                         * safely free the old array.
                         */
                        kmem_free(oprobes, osize);
                        dtrace_nprobes <<= 1;
                }

                /* A single growth step is expected to be sufficient. */
                ASSERT(id - 1 < dtrace_nprobes);
        }

        ASSERT(dtrace_probes[id - 1] == NULL);
        dtrace_probes[id - 1] = probe;

        if (provider != dtrace_provider)
                mutex_exit(&dtrace_lock);

        return (id);
}
 9205 
 9206 static dtrace_probe_t *
 9207 dtrace_probe_lookup_id(dtrace_id_t id)
 9208 {
 9209         ASSERT(MUTEX_HELD(&dtrace_lock));
 9210 
 9211         if (id == 0 || id > dtrace_nprobes)
 9212                 return (NULL);
 9213 
 9214         return (dtrace_probes[id - 1]);
 9215 }
 9216 
 9217 static int
 9218 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
 9219 {
 9220         *((dtrace_id_t *)arg) = probe->dtpr_id;
 9221 
 9222         return (DTRACE_MATCH_DONE);
 9223 }
 9224 
 9225 /*
 9226  * Look up a probe based on provider and one or more of module name, function
 9227  * name and probe name.
 9228  */
 9229 dtrace_id_t
 9230 dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
 9231     char *func, char *name)
 9232 {
 9233         dtrace_probekey_t pkey;
 9234         dtrace_id_t id;
 9235         int match;
 9236 
 9237         pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
 9238         pkey.dtpk_pmatch = &dtrace_match_string;
 9239         pkey.dtpk_mod = mod;
 9240         pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
 9241         pkey.dtpk_func = func;
 9242         pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
 9243         pkey.dtpk_name = name;
 9244         pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
 9245         pkey.dtpk_id = DTRACE_IDNONE;
 9246 
 9247         mutex_enter(&dtrace_lock);
 9248         match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
 9249             dtrace_probe_lookup_match, &id);
 9250         mutex_exit(&dtrace_lock);
 9251 
 9252         ASSERT(match == 1 || match == 0);
 9253         return (match ? id : 0);
 9254 }
 9255 
 9256 /*
 9257  * Returns the probe argument associated with the specified probe.
 9258  */
 9259 void *
 9260 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
 9261 {
 9262         dtrace_probe_t *probe;
 9263         void *rval = NULL;
 9264 
 9265         mutex_enter(&dtrace_lock);
 9266 
 9267         if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
 9268             probe->dtpr_provider == (dtrace_provider_t *)id)
 9269                 rval = probe->dtpr_arg;
 9270 
 9271         mutex_exit(&dtrace_lock);
 9272 
 9273         return (rval);
 9274 }
 9275 
 9276 /*
 9277  * Copy a probe into a probe description.
 9278  */
 9279 static void
 9280 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
 9281 {
 9282         bzero(pdp, sizeof (dtrace_probedesc_t));
 9283         pdp->dtpd_id = prp->dtpr_id;
 9284 
 9285         (void) strncpy(pdp->dtpd_provider,
 9286             prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
 9287 
 9288         (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
 9289         (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
 9290         (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
 9291 }
 9292 
 9293 /*
 9294  * Called to indicate that a probe -- or probes -- should be provided by a
 9295  * specfied provider.  If the specified description is NULL, the provider will
 9296  * be told to provide all of its probes.  (This is done whenever a new
 9297  * consumer comes along, or whenever a retained enabling is to be matched.) If
 9298  * the specified description is non-NULL, the provider is given the
 9299  * opportunity to dynamically provide the specified probe, allowing providers
 9300  * to support the creation of probes on-the-fly.  (So-called _autocreated_
 9301  * probes.)  If the provider is NULL, the operations will be applied to all
 9302  * providers; if the provider is non-NULL the operations will only be applied
 9303  * to the specified provider.  The dtrace_provider_lock must be held, and the
 9304  * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
 9305  * will need to grab the dtrace_lock when it reenters the framework through
 9306  * dtrace_probe_lookup(), dtrace_probe_create(), etc.
 9307  */
 9308 static void
 9309 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
 9310 {
 9311 #ifdef illumos
 9312         modctl_t *ctl;
 9313 #endif
 9314         int all = 0;
 9315 
 9316         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
 9317 
 9318         if (prv == NULL) {
 9319                 all = 1;
 9320                 prv = dtrace_provider;
 9321         }
 9322 
 9323         do {
 9324                 /*
 9325                  * First, call the blanket provide operation.
 9326                  */
 9327                 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
 9328 
 9329 #ifdef illumos
 9330                 /*
 9331                  * Now call the per-module provide operation.  We will grab
 9332                  * mod_lock to prevent the list from being modified.  Note
 9333                  * that this also prevents the mod_busy bits from changing.
 9334                  * (mod_busy can only be changed with mod_lock held.)
 9335                  */
 9336                 mutex_enter(&mod_lock);
 9337 
 9338                 ctl = &modules;
 9339                 do {
 9340                         if (ctl->mod_busy || ctl->mod_mp == NULL)
 9341                                 continue;
 9342 
 9343                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
 9344 
 9345                 } while ((ctl = ctl->mod_next) != &modules);
 9346 
 9347                 mutex_exit(&mod_lock);
 9348 #endif
 9349         } while (all && (prv = prv->dtpv_next) != NULL);
 9350 }
 9351 
 9352 #ifdef illumos
 9353 /*
 9354  * Iterate over each probe, and call the Framework-to-Provider API function
 9355  * denoted by offs.
 9356  */
 9357 static void
 9358 dtrace_probe_foreach(uintptr_t offs)
 9359 {
 9360         dtrace_provider_t *prov;
 9361         void (*func)(void *, dtrace_id_t, void *);
 9362         dtrace_probe_t *probe;
 9363         dtrace_icookie_t cookie;
 9364         int i;
 9365 
 9366         /*
 9367          * We disable interrupts to walk through the probe array.  This is
 9368          * safe -- the dtrace_sync() in dtrace_unregister() assures that we
 9369          * won't see stale data.
 9370          */
 9371         cookie = dtrace_interrupt_disable();
 9372 
 9373         for (i = 0; i < dtrace_nprobes; i++) {
 9374                 if ((probe = dtrace_probes[i]) == NULL)
 9375                         continue;
 9376 
 9377                 if (probe->dtpr_ecb == NULL) {
 9378                         /*
 9379                          * This probe isn't enabled -- don't call the function.
 9380                          */
 9381                         continue;
 9382                 }
 9383 
 9384                 prov = probe->dtpr_provider;
 9385                 func = *((void(**)(void *, dtrace_id_t, void *))
 9386                     ((uintptr_t)&prov->dtpv_pops + offs));
 9387 
 9388                 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
 9389         }
 9390 
 9391         dtrace_interrupt_enable(cookie);
 9392 }
 9393 #endif
 9394 
 9395 static int
 9396 dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
 9397 {
 9398         dtrace_probekey_t pkey;
 9399         uint32_t priv;
 9400         uid_t uid;
 9401         zoneid_t zoneid;
 9402 
 9403         ASSERT(MUTEX_HELD(&dtrace_lock));
 9404         dtrace_ecb_create_cache = NULL;
 9405 
 9406         if (desc == NULL) {
 9407                 /*
 9408                  * If we're passed a NULL description, we're being asked to
 9409                  * create an ECB with a NULL probe.
 9410                  */
 9411                 (void) dtrace_ecb_create_enable(NULL, enab);
 9412                 return (0);
 9413         }
 9414 
 9415         dtrace_probekey(desc, &pkey);
 9416         dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
 9417             &priv, &uid, &zoneid);
 9418 
 9419         return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
 9420             enab));
 9421 }
 9422 
 9423 /*
 9424  * DTrace Helper Provider Functions
 9425  */
 9426 static void
 9427 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
 9428 {
 9429         attr->dtat_name = DOF_ATTR_NAME(dofattr);
 9430         attr->dtat_data = DOF_ATTR_DATA(dofattr);
 9431         attr->dtat_class = DOF_ATTR_CLASS(dofattr);
 9432 }
 9433 
 9434 static void
 9435 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
 9436     const dof_provider_t *dofprov, char *strtab)
 9437 {
 9438         hprov->dthpv_provname = strtab + dofprov->dofpv_name;
 9439         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
 9440             dofprov->dofpv_provattr);
 9441         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
 9442             dofprov->dofpv_modattr);
 9443         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
 9444             dofprov->dofpv_funcattr);
 9445         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
 9446             dofprov->dofpv_nameattr);
 9447         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
 9448             dofprov->dofpv_argsattr);
 9449 }
 9450 
 9451 static void
 9452 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
 9453 {
 9454         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
 9455         dof_hdr_t *dof = (dof_hdr_t *)daddr;
 9456         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
 9457         dof_provider_t *provider;
 9458         dof_probe_t *probe;
 9459         uint32_t *off, *enoff;
 9460         uint8_t *arg;
 9461         char *strtab;
 9462         uint_t i, nprobes;
 9463         dtrace_helper_provdesc_t dhpv;
 9464         dtrace_helper_probedesc_t dhpb;
 9465         dtrace_meta_t *meta = dtrace_meta_pid;
 9466         dtrace_mops_t *mops = &meta->dtm_mops;
 9467         void *parg;
 9468 
 9469         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
 9470         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
 9471             provider->dofpv_strtab * dof->dofh_secsize);
 9472         prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
 9473             provider->dofpv_probes * dof->dofh_secsize);
 9474         arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
 9475             provider->dofpv_prargs * dof->dofh_secsize);
 9476         off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
 9477             provider->dofpv_proffs * dof->dofh_secsize);
 9478 
 9479         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
 9480         off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
 9481         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
 9482         enoff = NULL;
 9483 
 9484         /*
 9485          * See dtrace_helper_provider_validate().
 9486          */
 9487         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
 9488             provider->dofpv_prenoffs != DOF_SECT_NONE) {
 9489                 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
 9490                     provider->dofpv_prenoffs * dof->dofh_secsize);
 9491                 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
 9492         }
 9493 
 9494         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
 9495 
 9496         /*
 9497          * Create the provider.
 9498          */
 9499         dtrace_dofprov2hprov(&dhpv, provider, strtab);
 9500 
 9501         if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
 9502                 return;
 9503 
 9504         meta->dtm_count++;
 9505 
 9506         /*
 9507          * Create the probes.
 9508          */
 9509         for (i = 0; i < nprobes; i++) {
 9510                 probe = (dof_probe_t *)(uintptr_t)(daddr +
 9511                     prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
 9512 
 9513                 /* See the check in dtrace_helper_provider_validate(). */
 9514                 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN)
 9515                         continue;
 9516 
 9517                 dhpb.dthpb_mod = dhp->dofhp_mod;
 9518                 dhpb.dthpb_func = strtab + probe->dofpr_func;
 9519                 dhpb.dthpb_name = strtab + probe->dofpr_name;
 9520                 dhpb.dthpb_base = probe->dofpr_addr;
 9521                 dhpb.dthpb_offs = off + probe->dofpr_offidx;
 9522                 dhpb.dthpb_noffs = probe->dofpr_noffs;
 9523                 if (enoff != NULL) {
 9524                         dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
 9525                         dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
 9526                 } else {
 9527                         dhpb.dthpb_enoffs = NULL;
 9528                         dhpb.dthpb_nenoffs = 0;
 9529                 }
 9530                 dhpb.dthpb_args = arg + probe->dofpr_argidx;
 9531                 dhpb.dthpb_nargc = probe->dofpr_nargc;
 9532                 dhpb.dthpb_xargc = probe->dofpr_xargc;
 9533                 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
 9534                 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
 9535 
 9536                 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
 9537         }
 9538 }
 9539 
 9540 static void
 9541 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
 9542 {
 9543         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
 9544         dof_hdr_t *dof = (dof_hdr_t *)daddr;
 9545         int i;
 9546 
 9547         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
 9548 
 9549         for (i = 0; i < dof->dofh_secnum; i++) {
 9550                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
 9551                     dof->dofh_secoff + i * dof->dofh_secsize);
 9552 
 9553                 if (sec->dofs_type != DOF_SECT_PROVIDER)
 9554                         continue;
 9555 
 9556                 dtrace_helper_provide_one(dhp, sec, pid);
 9557         }
 9558 
 9559         /*
 9560          * We may have just created probes, so we must now rematch against
 9561          * any retained enablings.  Note that this call will acquire both
 9562          * cpu_lock and dtrace_lock; the fact that we are holding
 9563          * dtrace_meta_lock now is what defines the ordering with respect to
 9564          * these three locks.
 9565          */
 9566         dtrace_enabling_matchall();
 9567 }
 9568 
 9569 static void
 9570 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
 9571 {
 9572         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
 9573         dof_hdr_t *dof = (dof_hdr_t *)daddr;
 9574         dof_sec_t *str_sec;
 9575         dof_provider_t *provider;
 9576         char *strtab;
 9577         dtrace_helper_provdesc_t dhpv;
 9578         dtrace_meta_t *meta = dtrace_meta_pid;
 9579         dtrace_mops_t *mops = &meta->dtm_mops;
 9580 
 9581         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
 9582         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
 9583             provider->dofpv_strtab * dof->dofh_secsize);
 9584 
 9585         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
 9586 
 9587         /*
 9588          * Create the provider.
 9589          */
 9590         dtrace_dofprov2hprov(&dhpv, provider, strtab);
 9591 
 9592         mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
 9593 
 9594         meta->dtm_count--;
 9595 }
 9596 
 9597 static void
 9598 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
 9599 {
 9600         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
 9601         dof_hdr_t *dof = (dof_hdr_t *)daddr;
 9602         int i;
 9603 
 9604         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
 9605 
 9606         for (i = 0; i < dof->dofh_secnum; i++) {
 9607                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
 9608                     dof->dofh_secoff + i * dof->dofh_secsize);
 9609 
 9610                 if (sec->dofs_type != DOF_SECT_PROVIDER)
 9611                         continue;
 9612 
 9613                 dtrace_helper_provider_remove_one(dhp, sec, pid);
 9614         }
 9615 }
 9616 
 9617 /*
 9618  * DTrace Meta Provider-to-Framework API Functions
 9619  *
 9620  * These functions implement the Meta Provider-to-Framework API, as described
 9621  * in <sys/dtrace.h>.
 9622  */
 9623 int
 9624 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
 9625     dtrace_meta_provider_id_t *idp)
 9626 {
 9627         dtrace_meta_t *meta;
 9628         dtrace_helpers_t *help, *next;
 9629         int i;
 9630 
 9631         *idp = DTRACE_METAPROVNONE;
 9632 
 9633         /*
 9634          * We strictly don't need the name, but we hold onto it for
 9635          * debuggability. All hail error queues!
 9636          */
 9637         if (name == NULL) {
 9638                 cmn_err(CE_WARN, "failed to register meta-provider: "
 9639                     "invalid name");
 9640                 return (EINVAL);
 9641         }
 9642 
 9643         if (mops == NULL ||
 9644             mops->dtms_create_probe == NULL ||
 9645             mops->dtms_provide_pid == NULL ||
 9646             mops->dtms_remove_pid == NULL) {
 9647                 cmn_err(CE_WARN, "failed to register meta-register %s: "
 9648                     "invalid ops", name);
 9649                 return (EINVAL);
 9650         }
 9651 
 9652         meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
 9653         meta->dtm_mops = *mops;
 9654         meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
 9655         (void) strcpy(meta->dtm_name, name);
 9656         meta->dtm_arg = arg;
 9657 
 9658         mutex_enter(&dtrace_meta_lock);
 9659         mutex_enter(&dtrace_lock);
 9660 
 9661         if (dtrace_meta_pid != NULL) {
 9662                 mutex_exit(&dtrace_lock);
 9663                 mutex_exit(&dtrace_meta_lock);
 9664                 cmn_err(CE_WARN, "failed to register meta-register %s: "
 9665                     "user-land meta-provider exists", name);
 9666                 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
 9667                 kmem_free(meta, sizeof (dtrace_meta_t));
 9668                 return (EINVAL);
 9669         }
 9670 
 9671         dtrace_meta_pid = meta;
 9672         *idp = (dtrace_meta_provider_id_t)meta;
 9673 
 9674         /*
 9675          * If there are providers and probes ready to go, pass them
 9676          * off to the new meta provider now.
 9677          */
 9678 
 9679         help = dtrace_deferred_pid;
 9680         dtrace_deferred_pid = NULL;
 9681 
 9682         mutex_exit(&dtrace_lock);
 9683 
 9684         while (help != NULL) {
 9685                 for (i = 0; i < help->dthps_nprovs; i++) {
 9686                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
 9687                             help->dthps_pid);
 9688                 }
 9689 
 9690                 next = help->dthps_next;
 9691                 help->dthps_next = NULL;
 9692                 help->dthps_prev = NULL;
 9693                 help->dthps_deferred = 0;
 9694                 help = next;
 9695         }
 9696 
 9697         mutex_exit(&dtrace_meta_lock);
 9698 
 9699         return (0);
 9700 }
 9701 
 9702 int
 9703 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
 9704 {
 9705         dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
 9706 
 9707         mutex_enter(&dtrace_meta_lock);
 9708         mutex_enter(&dtrace_lock);
 9709 
 9710         if (old == dtrace_meta_pid) {
 9711                 pp = &dtrace_meta_pid;
 9712         } else {
 9713                 panic("attempt to unregister non-existent "
 9714                     "dtrace meta-provider %p\n", (void *)old);
 9715         }
 9716 
 9717         if (old->dtm_count != 0) {
 9718                 mutex_exit(&dtrace_lock);
 9719                 mutex_exit(&dtrace_meta_lock);
 9720                 return (EBUSY);
 9721         }
 9722 
 9723         *pp = NULL;
 9724 
 9725         mutex_exit(&dtrace_lock);
 9726         mutex_exit(&dtrace_meta_lock);
 9727 
 9728         kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
 9729         kmem_free(old, sizeof (dtrace_meta_t));
 9730 
 9731         return (0);
 9732 }
 9733 
 9734 
 9735 /*
 9736  * DTrace DIF Object Functions
 9737  */
 9738 static int
 9739 dtrace_difo_err(uint_t pc, const char *format, ...)
 9740 {
 9741         if (dtrace_err_verbose) {
 9742                 va_list alist;
 9743 
 9744                 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
 9745                 va_start(alist, format);
 9746                 (void) vuprintf(format, alist);
 9747                 va_end(alist);
 9748         }
 9749 
 9750 #ifdef DTRACE_ERRDEBUG
 9751         dtrace_errdebug(format);
 9752 #endif
 9753         return (1);
 9754 }
 9755 
 9756 /*
 9757  * Validate a DTrace DIF object by checking the IR instructions.  The following
 9758  * rules are currently enforced by dtrace_difo_validate():
 9759  *
 9760  * 1. Each instruction must have a valid opcode
 9761  * 2. Each register, string, variable, or subroutine reference must be valid
 9762  * 3. No instruction can modify register %r0 (must be zero)
 9763  * 4. All instruction reserved bits must be set to zero
 9764  * 5. The last instruction must be a "ret" instruction
 9765  * 6. All branch targets must reference a valid instruction _after_ the branch
 9766  */
 9767 static int
 9768 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
 9769     cred_t *cr)
 9770 {
 9771         int err = 0, i;
 9772         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
 9773         int kcheckload;
 9774         uint_t pc;
 9775         int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
 9776 
 9777         kcheckload = cr == NULL ||
 9778             (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
 9779 
 9780         dp->dtdo_destructive = 0;
 9781 
 9782         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
 9783                 dif_instr_t instr = dp->dtdo_buf[pc];
 9784 
 9785                 uint_t r1 = DIF_INSTR_R1(instr);
 9786                 uint_t r2 = DIF_INSTR_R2(instr);
 9787                 uint_t rd = DIF_INSTR_RD(instr);
 9788                 uint_t rs = DIF_INSTR_RS(instr);
 9789                 uint_t label = DIF_INSTR_LABEL(instr);
 9790                 uint_t v = DIF_INSTR_VAR(instr);
 9791                 uint_t subr = DIF_INSTR_SUBR(instr);
 9792                 uint_t type = DIF_INSTR_TYPE(instr);
 9793                 uint_t op = DIF_INSTR_OP(instr);
 9794 
 9795                 switch (op) {
 9796                 case DIF_OP_OR:
 9797                 case DIF_OP_XOR:
 9798                 case DIF_OP_AND:
 9799                 case DIF_OP_SLL:
 9800                 case DIF_OP_SRL:
 9801                 case DIF_OP_SRA:
 9802                 case DIF_OP_SUB:
 9803                 case DIF_OP_ADD:
 9804                 case DIF_OP_MUL:
 9805                 case DIF_OP_SDIV:
 9806                 case DIF_OP_UDIV:
 9807                 case DIF_OP_SREM:
 9808                 case DIF_OP_UREM:
 9809                 case DIF_OP_COPYS:
 9810                         if (r1 >= nregs)
 9811                                 err += efunc(pc, "invalid register %u\n", r1);
 9812                         if (r2 >= nregs)
 9813                                 err += efunc(pc, "invalid register %u\n", r2);
 9814                         if (rd >= nregs)
 9815                                 err += efunc(pc, "invalid register %u\n", rd);
 9816                         if (rd == 0)
 9817                                 err += efunc(pc, "cannot write to %%r0\n");
 9818                         break;
 9819                 case DIF_OP_NOT:
 9820                 case DIF_OP_MOV:
 9821                 case DIF_OP_ALLOCS:
 9822                         if (r1 >= nregs)
 9823                                 err += efunc(pc, "invalid register %u\n", r1);
 9824                         if (r2 != 0)
 9825                                 err += efunc(pc, "non-zero reserved bits\n");
 9826                         if (rd >= nregs)
 9827                                 err += efunc(pc, "invalid register %u\n", rd);
 9828                         if (rd == 0)
 9829                                 err += efunc(pc, "cannot write to %%r0\n");
 9830                         break;
 9831                 case DIF_OP_LDSB:
 9832                 case DIF_OP_LDSH:
 9833                 case DIF_OP_LDSW:
 9834                 case DIF_OP_LDUB:
 9835                 case DIF_OP_LDUH:
 9836                 case DIF_OP_LDUW:
 9837                 case DIF_OP_LDX:
 9838                         if (r1 >= nregs)
 9839                                 err += efunc(pc, "invalid register %u\n", r1);
 9840                         if (r2 != 0)
 9841                                 err += efunc(pc, "non-zero reserved bits\n");
 9842                         if (rd >= nregs)
 9843                                 err += efunc(pc, "invalid register %u\n", rd);
 9844                         if (rd == 0)
 9845                                 err += efunc(pc, "cannot write to %%r0\n");
 9846                         if (kcheckload)
 9847                                 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
 9848                                     DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
 9849                         break;
 9850                 case DIF_OP_RLDSB:
 9851                 case DIF_OP_RLDSH:
 9852                 case DIF_OP_RLDSW:
 9853                 case DIF_OP_RLDUB:
 9854                 case DIF_OP_RLDUH:
 9855                 case DIF_OP_RLDUW:
 9856                 case DIF_OP_RLDX:
 9857                         if (r1 >= nregs)
 9858                                 err += efunc(pc, "invalid register %u\n", r1);
 9859                         if (r2 != 0)
 9860                                 err += efunc(pc, "non-zero reserved bits\n");
 9861                         if (rd >= nregs)
 9862                                 err += efunc(pc, "invalid register %u\n", rd);
 9863                         if (rd == 0)
 9864                                 err += efunc(pc, "cannot write to %%r0\n");
 9865                         break;
 9866                 case DIF_OP_ULDSB:
 9867                 case DIF_OP_ULDSH:
 9868                 case DIF_OP_ULDSW:
 9869                 case DIF_OP_ULDUB:
 9870                 case DIF_OP_ULDUH:
 9871                 case DIF_OP_ULDUW:
 9872                 case DIF_OP_ULDX:
 9873                         if (r1 >= nregs)
 9874                                 err += efunc(pc, "invalid register %u\n", r1);
 9875                         if (r2 != 0)
 9876                                 err += efunc(pc, "non-zero reserved bits\n");
 9877                         if (rd >= nregs)
 9878                                 err += efunc(pc, "invalid register %u\n", rd);
 9879                         if (rd == 0)
 9880                                 err += efunc(pc, "cannot write to %%r0\n");
 9881                         break;
 9882                 case DIF_OP_STB:
 9883                 case DIF_OP_STH:
 9884                 case DIF_OP_STW:
 9885                 case DIF_OP_STX:
 9886                         if (r1 >= nregs)
 9887                                 err += efunc(pc, "invalid register %u\n", r1);
 9888                         if (r2 != 0)
 9889                                 err += efunc(pc, "non-zero reserved bits\n");
 9890                         if (rd >= nregs)
 9891                                 err += efunc(pc, "invalid register %u\n", rd);
 9892                         if (rd == 0)
 9893                                 err += efunc(pc, "cannot write to 0 address\n");
 9894                         break;
 9895                 case DIF_OP_CMP:
 9896                 case DIF_OP_SCMP:
 9897                         if (r1 >= nregs)
 9898                                 err += efunc(pc, "invalid register %u\n", r1);
 9899                         if (r2 >= nregs)
 9900                                 err += efunc(pc, "invalid register %u\n", r2);
 9901                         if (rd != 0)
 9902                                 err += efunc(pc, "non-zero reserved bits\n");
 9903                         break;
 9904                 case DIF_OP_TST:
 9905                         if (r1 >= nregs)
 9906                                 err += efunc(pc, "invalid register %u\n", r1);
 9907                         if (r2 != 0 || rd != 0)
 9908                                 err += efunc(pc, "non-zero reserved bits\n");
 9909                         break;
 9910                 case DIF_OP_BA:
 9911                 case DIF_OP_BE:
 9912                 case DIF_OP_BNE:
 9913                 case DIF_OP_BG:
 9914                 case DIF_OP_BGU:
 9915                 case DIF_OP_BGE:
 9916                 case DIF_OP_BGEU:
 9917                 case DIF_OP_BL:
 9918                 case DIF_OP_BLU:
 9919                 case DIF_OP_BLE:
 9920                 case DIF_OP_BLEU:
 9921                         if (label >= dp->dtdo_len) {
 9922                                 err += efunc(pc, "invalid branch target %u\n",
 9923                                     label);
 9924                         }
 9925                         if (label <= pc) {
 9926                                 err += efunc(pc, "backward branch to %u\n",
 9927                                     label);
 9928                         }
 9929                         break;
 9930                 case DIF_OP_RET:
 9931                         if (r1 != 0 || r2 != 0)
 9932                                 err += efunc(pc, "non-zero reserved bits\n");
 9933                         if (rd >= nregs)
 9934                                 err += efunc(pc, "invalid register %u\n", rd);
 9935                         break;
 9936                 case DIF_OP_NOP:
 9937                 case DIF_OP_POPTS:
 9938                 case DIF_OP_FLUSHTS:
 9939                         if (r1 != 0 || r2 != 0 || rd != 0)
 9940                                 err += efunc(pc, "non-zero reserved bits\n");
 9941                         break;
 9942                 case DIF_OP_SETX:
 9943                         if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
 9944                                 err += efunc(pc, "invalid integer ref %u\n",
 9945                                     DIF_INSTR_INTEGER(instr));
 9946                         }
 9947                         if (rd >= nregs)
 9948                                 err += efunc(pc, "invalid register %u\n", rd);
 9949                         if (rd == 0)
 9950                                 err += efunc(pc, "cannot write to %%r0\n");
 9951                         break;
 9952                 case DIF_OP_SETS:
 9953                         if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
 9954                                 err += efunc(pc, "invalid string ref %u\n",
 9955                                     DIF_INSTR_STRING(instr));
 9956                         }
 9957                         if (rd >= nregs)
 9958                                 err += efunc(pc, "invalid register %u\n", rd);
 9959                         if (rd == 0)
 9960                                 err += efunc(pc, "cannot write to %%r0\n");
 9961                         break;
 9962                 case DIF_OP_LDGA:
 9963                 case DIF_OP_LDTA:
 9964                         if (r1 > DIF_VAR_ARRAY_MAX)
 9965                                 err += efunc(pc, "invalid array %u\n", r1);
 9966                         if (r2 >= nregs)
 9967                                 err += efunc(pc, "invalid register %u\n", r2);
 9968                         if (rd >= nregs)
 9969                                 err += efunc(pc, "invalid register %u\n", rd);
 9970                         if (rd == 0)
 9971                                 err += efunc(pc, "cannot write to %%r0\n");
 9972                         break;
 9973                 case DIF_OP_LDGS:
 9974                 case DIF_OP_LDTS:
 9975                 case DIF_OP_LDLS:
 9976                 case DIF_OP_LDGAA:
 9977                 case DIF_OP_LDTAA:
 9978                         if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
 9979                                 err += efunc(pc, "invalid variable %u\n", v);
 9980                         if (rd >= nregs)
 9981                                 err += efunc(pc, "invalid register %u\n", rd);
 9982                         if (rd == 0)
 9983                                 err += efunc(pc, "cannot write to %%r0\n");
 9984                         break;
 9985                 case DIF_OP_STGS:
 9986                 case DIF_OP_STTS:
 9987                 case DIF_OP_STLS:
 9988                 case DIF_OP_STGAA:
 9989                 case DIF_OP_STTAA:
 9990                         if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
 9991                                 err += efunc(pc, "invalid variable %u\n", v);
 9992                         if (rs >= nregs)
 9993                                 err += efunc(pc, "invalid register %u\n", rd);
 9994                         break;
 9995                 case DIF_OP_CALL:
 9996                         if (subr > DIF_SUBR_MAX)
 9997                                 err += efunc(pc, "invalid subr %u\n", subr);
 9998                         if (rd >= nregs)
 9999                                 err += efunc(pc, "invalid register %u\n", rd);
10000                         if (rd == 0)
10001                                 err += efunc(pc, "cannot write to %%r0\n");
10002 
10003                         if (subr == DIF_SUBR_COPYOUT ||
10004                             subr == DIF_SUBR_COPYOUTSTR) {
10005                                 dp->dtdo_destructive = 1;
10006                         }
10007 
10008                         if (subr == DIF_SUBR_GETF) {
10009 #ifdef __FreeBSD__
10010                                 err += efunc(pc, "getf() not supported");
10011 #else
10012                                 /*
10013                                  * If we have a getf() we need to record that
10014                                  * in our state.  Note that our state can be
10015                                  * NULL if this is a helper -- but in that
10016                                  * case, the call to getf() is itself illegal,
10017                                  * and will be caught (slightly later) when
10018                                  * the helper is validated.
10019                                  */
10020                                 if (vstate->dtvs_state != NULL)
10021                                         vstate->dtvs_state->dts_getf++;
10022 #endif
10023                         }
10024 
10025                         break;
10026                 case DIF_OP_PUSHTR:
10027                         if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
10028                                 err += efunc(pc, "invalid ref type %u\n", type);
10029                         if (r2 >= nregs)
10030                                 err += efunc(pc, "invalid register %u\n", r2);
10031                         if (rs >= nregs)
10032                                 err += efunc(pc, "invalid register %u\n", rs);
10033                         break;
10034                 case DIF_OP_PUSHTV:
10035                         if (type != DIF_TYPE_CTF)
10036                                 err += efunc(pc, "invalid val type %u\n", type);
10037                         if (r2 >= nregs)
10038                                 err += efunc(pc, "invalid register %u\n", r2);
10039                         if (rs >= nregs)
10040                                 err += efunc(pc, "invalid register %u\n", rs);
10041                         break;
10042                 default:
10043                         err += efunc(pc, "invalid opcode %u\n",
10044                             DIF_INSTR_OP(instr));
10045                 }
10046         }
10047 
10048         if (dp->dtdo_len != 0 &&
10049             DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
10050                 err += efunc(dp->dtdo_len - 1,
10051                     "expected 'ret' as last DIF instruction\n");
10052         }
10053 
10054         if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
10055                 /*
10056                  * If we're not returning by reference, the size must be either
10057                  * 0 or the size of one of the base types.
10058                  */
10059                 switch (dp->dtdo_rtype.dtdt_size) {
10060                 case 0:
10061                 case sizeof (uint8_t):
10062                 case sizeof (uint16_t):
10063                 case sizeof (uint32_t):
10064                 case sizeof (uint64_t):
10065                         break;
10066 
10067                 default:
10068                         err += efunc(dp->dtdo_len - 1, "bad return size\n");
10069                 }
10070         }
10071 
10072         for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
10073                 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
10074                 dtrace_diftype_t *vt, *et;
10075                 uint_t id, ndx;
10076 
10077                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
10078                     v->dtdv_scope != DIFV_SCOPE_THREAD &&
10079                     v->dtdv_scope != DIFV_SCOPE_LOCAL) {
10080                         err += efunc(i, "unrecognized variable scope %d\n",
10081                             v->dtdv_scope);
10082                         break;
10083                 }
10084 
10085                 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
10086                     v->dtdv_kind != DIFV_KIND_SCALAR) {
10087                         err += efunc(i, "unrecognized variable type %d\n",
10088                             v->dtdv_kind);
10089                         break;
10090                 }
10091 
10092                 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
10093                         err += efunc(i, "%d exceeds variable id limit\n", id);
10094                         break;
10095                 }
10096 
10097                 if (id < DIF_VAR_OTHER_UBASE)
10098                         continue;
10099 
10100                 /*
10101                  * For user-defined variables, we need to check that this
10102                  * definition is identical to any previous definition that we
10103                  * encountered.
10104                  */
10105                 ndx = id - DIF_VAR_OTHER_UBASE;
10106 
10107                 switch (v->dtdv_scope) {
10108                 case DIFV_SCOPE_GLOBAL:
10109                         if (maxglobal == -1 || ndx > maxglobal)
10110                                 maxglobal = ndx;
10111 
10112                         if (ndx < vstate->dtvs_nglobals) {
10113                                 dtrace_statvar_t *svar;
10114 
10115                                 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
10116                                         existing = &svar->dtsv_var;
10117                         }
10118 
10119                         break;
10120 
10121                 case DIFV_SCOPE_THREAD:
10122                         if (maxtlocal == -1 || ndx > maxtlocal)
10123                                 maxtlocal = ndx;
10124 
10125                         if (ndx < vstate->dtvs_ntlocals)
10126                                 existing = &vstate->dtvs_tlocals[ndx];
10127                         break;
10128 
10129                 case DIFV_SCOPE_LOCAL:
10130                         if (maxlocal == -1 || ndx > maxlocal)
10131                                 maxlocal = ndx;
10132 
10133                         if (ndx < vstate->dtvs_nlocals) {
10134                                 dtrace_statvar_t *svar;
10135 
10136                                 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
10137                                         existing = &svar->dtsv_var;
10138                         }
10139 
10140                         break;
10141                 }
10142 
10143                 vt = &v->dtdv_type;
10144 
10145                 if (vt->dtdt_flags & DIF_TF_BYREF) {
10146                         if (vt->dtdt_size == 0) {
10147                                 err += efunc(i, "zero-sized variable\n");
10148                                 break;
10149                         }
10150 
10151                         if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
10152                             v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
10153                             vt->dtdt_size > dtrace_statvar_maxsize) {
10154                                 err += efunc(i, "oversized by-ref static\n");
10155                                 break;
10156                         }
10157                 }
10158 
10159                 if (existing == NULL || existing->dtdv_id == 0)
10160                         continue;
10161 
10162                 ASSERT(existing->dtdv_id == v->dtdv_id);
10163                 ASSERT(existing->dtdv_scope == v->dtdv_scope);
10164 
10165                 if (existing->dtdv_kind != v->dtdv_kind)
10166                         err += efunc(i, "%d changed variable kind\n", id);
10167 
10168                 et = &existing->dtdv_type;
10169 
10170                 if (vt->dtdt_flags != et->dtdt_flags) {
10171                         err += efunc(i, "%d changed variable type flags\n", id);
10172                         break;
10173                 }
10174 
10175                 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
10176                         err += efunc(i, "%d changed variable type size\n", id);
10177                         break;
10178                 }
10179         }
10180 
10181         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
10182                 dif_instr_t instr = dp->dtdo_buf[pc];
10183 
10184                 uint_t v = DIF_INSTR_VAR(instr);
10185                 uint_t op = DIF_INSTR_OP(instr);
10186 
10187                 switch (op) {
10188                 case DIF_OP_LDGS:
10189                 case DIF_OP_LDGAA:
10190                 case DIF_OP_STGS:
10191                 case DIF_OP_STGAA:
10192                         if (v > DIF_VAR_OTHER_UBASE + maxglobal)
10193                                 err += efunc(pc, "invalid variable %u\n", v);
10194                         break;
10195                 case DIF_OP_LDTS:
10196                 case DIF_OP_LDTAA:
10197                 case DIF_OP_STTS:
10198                 case DIF_OP_STTAA:
10199                         if (v > DIF_VAR_OTHER_UBASE + maxtlocal)
10200                                 err += efunc(pc, "invalid variable %u\n", v);
10201                         break;
10202                 case DIF_OP_LDLS:
10203                 case DIF_OP_STLS:
10204                         if (v > DIF_VAR_OTHER_UBASE + maxlocal)
10205                                 err += efunc(pc, "invalid variable %u\n", v);
10206                         break;
10207                 default:
10208                         break;
10209                 }
10210         }
10211 
10212         return (err);
10213 }
10214 
10215 /*
10216  * Validate a DTrace DIF object that it is to be used as a helper.  Helpers
10217  * are much more constrained than normal DIFOs.  Specifically, they may
10218  * not:
10219  *
10220  * 1. Make calls to subroutines other than copyin(), copyinstr() or
10221  *    miscellaneous string routines
10222  * 2. Access DTrace variables other than the args[] array, and the
10223  *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
10224  * 3. Have thread-local variables.
10225  * 4. Have dynamic variables.
10226  */
10227 static int
10228 dtrace_difo_validate_helper(dtrace_difo_t *dp)
10229 {
10230         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
10231         int err = 0;
10232         uint_t pc;
10233 
10234         for (pc = 0; pc < dp->dtdo_len; pc++) {
10235                 dif_instr_t instr = dp->dtdo_buf[pc];
10236 
10237                 uint_t v = DIF_INSTR_VAR(instr);
10238                 uint_t subr = DIF_INSTR_SUBR(instr);
10239                 uint_t op = DIF_INSTR_OP(instr);
10240 
10241                 switch (op) {
10242                 case DIF_OP_OR:
10243                 case DIF_OP_XOR:
10244                 case DIF_OP_AND:
10245                 case DIF_OP_SLL:
10246                 case DIF_OP_SRL:
10247                 case DIF_OP_SRA:
10248                 case DIF_OP_SUB:
10249                 case DIF_OP_ADD:
10250                 case DIF_OP_MUL:
10251                 case DIF_OP_SDIV:
10252                 case DIF_OP_UDIV:
10253                 case DIF_OP_SREM:
10254                 case DIF_OP_UREM:
10255                 case DIF_OP_COPYS:
10256                 case DIF_OP_NOT:
10257                 case DIF_OP_MOV:
10258                 case DIF_OP_RLDSB:
10259                 case DIF_OP_RLDSH:
10260                 case DIF_OP_RLDSW:
10261                 case DIF_OP_RLDUB:
10262                 case DIF_OP_RLDUH:
10263                 case DIF_OP_RLDUW:
10264                 case DIF_OP_RLDX:
10265                 case DIF_OP_ULDSB:
10266                 case DIF_OP_ULDSH:
10267                 case DIF_OP_ULDSW:
10268                 case DIF_OP_ULDUB:
10269                 case DIF_OP_ULDUH:
10270                 case DIF_OP_ULDUW:
10271                 case DIF_OP_ULDX:
10272                 case DIF_OP_STB:
10273                 case DIF_OP_STH:
10274                 case DIF_OP_STW:
10275                 case DIF_OP_STX:
10276                 case DIF_OP_ALLOCS:
10277                 case DIF_OP_CMP:
10278                 case DIF_OP_SCMP:
10279                 case DIF_OP_TST:
10280                 case DIF_OP_BA:
10281                 case DIF_OP_BE:
10282                 case DIF_OP_BNE:
10283                 case DIF_OP_BG:
10284                 case DIF_OP_BGU:
10285                 case DIF_OP_BGE:
10286                 case DIF_OP_BGEU:
10287                 case DIF_OP_BL:
10288                 case DIF_OP_BLU:
10289                 case DIF_OP_BLE:
10290                 case DIF_OP_BLEU:
10291                 case DIF_OP_RET:
10292                 case DIF_OP_NOP:
10293                 case DIF_OP_POPTS:
10294                 case DIF_OP_FLUSHTS:
10295                 case DIF_OP_SETX:
10296                 case DIF_OP_SETS:
10297                 case DIF_OP_LDGA:
10298                 case DIF_OP_LDLS:
10299                 case DIF_OP_STGS:
10300                 case DIF_OP_STLS:
10301                 case DIF_OP_PUSHTR:
10302                 case DIF_OP_PUSHTV:
10303                         break;
10304 
10305                 case DIF_OP_LDGS:
10306                         if (v >= DIF_VAR_OTHER_UBASE)
10307                                 break;
10308 
10309                         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10310                                 break;
10311 
10312                         if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10313                             v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10314                             v == DIF_VAR_EXECARGS ||
10315                             v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10316                             v == DIF_VAR_UID || v == DIF_VAR_GID)
10317                                 break;
10318 
10319                         err += efunc(pc, "illegal variable %u\n", v);
10320                         break;
10321 
10322                 case DIF_OP_LDTA:
10323                 case DIF_OP_LDTS:
10324                 case DIF_OP_LDGAA:
10325                 case DIF_OP_LDTAA:
10326                         err += efunc(pc, "illegal dynamic variable load\n");
10327                         break;
10328 
10329                 case DIF_OP_STTS:
10330                 case DIF_OP_STGAA:
10331                 case DIF_OP_STTAA:
10332                         err += efunc(pc, "illegal dynamic variable store\n");
10333                         break;
10334 
10335                 case DIF_OP_CALL:
10336                         if (subr == DIF_SUBR_ALLOCA ||
10337                             subr == DIF_SUBR_BCOPY ||
10338                             subr == DIF_SUBR_COPYIN ||
10339                             subr == DIF_SUBR_COPYINTO ||
10340                             subr == DIF_SUBR_COPYINSTR ||
10341                             subr == DIF_SUBR_INDEX ||
10342                             subr == DIF_SUBR_INET_NTOA ||
10343                             subr == DIF_SUBR_INET_NTOA6 ||
10344                             subr == DIF_SUBR_INET_NTOP ||
10345                             subr == DIF_SUBR_JSON ||
10346                             subr == DIF_SUBR_LLTOSTR ||
10347                             subr == DIF_SUBR_STRTOLL ||
10348                             subr == DIF_SUBR_RINDEX ||
10349                             subr == DIF_SUBR_STRCHR ||
10350                             subr == DIF_SUBR_STRJOIN ||
10351                             subr == DIF_SUBR_STRRCHR ||
10352                             subr == DIF_SUBR_STRSTR ||
10353                             subr == DIF_SUBR_HTONS ||
10354                             subr == DIF_SUBR_HTONL ||
10355                             subr == DIF_SUBR_HTONLL ||
10356                             subr == DIF_SUBR_NTOHS ||
10357                             subr == DIF_SUBR_NTOHL ||
10358                             subr == DIF_SUBR_NTOHLL ||
10359                             subr == DIF_SUBR_MEMREF)
10360                                 break;
10361 #ifdef __FreeBSD__
10362                         if (subr == DIF_SUBR_MEMSTR)
10363                                 break;
10364 #endif
10365 
10366                         err += efunc(pc, "invalid subr %u\n", subr);
10367                         break;
10368 
10369                 default:
10370                         err += efunc(pc, "invalid opcode %u\n",
10371                             DIF_INSTR_OP(instr));
10372                 }
10373         }
10374 
10375         return (err);
10376 }
10377 
10378 /*
10379  * Returns 1 if the expression in the DIF object can be cached on a per-thread
10380  * basis; 0 if not.
10381  */
10382 static int
10383 dtrace_difo_cacheable(dtrace_difo_t *dp)
10384 {
10385         int i;
10386 
10387         if (dp == NULL)
10388                 return (0);
10389 
10390         for (i = 0; i < dp->dtdo_varlen; i++) {
10391                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10392 
10393                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10394                         continue;
10395 
10396                 switch (v->dtdv_id) {
10397                 case DIF_VAR_CURTHREAD:
10398                 case DIF_VAR_PID:
10399                 case DIF_VAR_TID:
10400                 case DIF_VAR_EXECARGS:
10401                 case DIF_VAR_EXECNAME:
10402                 case DIF_VAR_ZONENAME:
10403                         break;
10404 
10405                 default:
10406                         return (0);
10407                 }
10408         }
10409 
10410         /*
10411          * This DIF object may be cacheable.  Now we need to look for any
10412          * array loading instructions, any memory loading instructions, or
10413          * any stores to thread-local variables.
10414          */
10415         for (i = 0; i < dp->dtdo_len; i++) {
10416                 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10417 
10418                 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10419                     (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10420                     (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10421                     op == DIF_OP_LDGA || op == DIF_OP_STTS)
10422                         return (0);
10423         }
10424 
10425         return (1);
10426 }
10427 
10428 static void
10429 dtrace_difo_hold(dtrace_difo_t *dp)
10430 {
10431         int i;
10432 
10433         ASSERT(MUTEX_HELD(&dtrace_lock));
10434 
10435         dp->dtdo_refcnt++;
10436         ASSERT(dp->dtdo_refcnt != 0);
10437 
10438         /*
10439          * We need to check this DIF object for references to the variable
10440          * DIF_VAR_VTIMESTAMP.
10441          */
10442         for (i = 0; i < dp->dtdo_varlen; i++) {
10443                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10444 
10445                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10446                         continue;
10447 
10448                 if (dtrace_vtime_references++ == 0)
10449                         dtrace_vtime_enable();
10450         }
10451 }
10452 
10453 /*
10454  * This routine calculates the dynamic variable chunksize for a given DIF
10455  * object.  The calculation is not fool-proof, and can probably be tricked by
10456  * malicious DIF -- but it works for all compiler-generated DIF.  Because this
10457  * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10458  * if a dynamic variable size exceeds the chunksize.
10459  */
10460 static void
10461 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10462 {
10463         uint64_t sval = 0;
10464         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10465         const dif_instr_t *text = dp->dtdo_buf;
10466         uint_t pc, srd = 0;
10467         uint_t ttop = 0;
10468         size_t size, ksize;
10469         uint_t id, i;
10470 
10471         for (pc = 0; pc < dp->dtdo_len; pc++) {
10472                 dif_instr_t instr = text[pc];
10473                 uint_t op = DIF_INSTR_OP(instr);
10474                 uint_t rd = DIF_INSTR_RD(instr);
10475                 uint_t r1 = DIF_INSTR_R1(instr);
10476                 uint_t nkeys = 0;
10477                 uchar_t scope = 0;
10478 
10479                 dtrace_key_t *key = tupregs;
10480 
10481                 switch (op) {
10482                 case DIF_OP_SETX:
10483                         sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10484                         srd = rd;
10485                         continue;
10486 
10487                 case DIF_OP_STTS:
10488                         key = &tupregs[DIF_DTR_NREGS];
10489                         key[0].dttk_size = 0;
10490                         key[1].dttk_size = 0;
10491                         nkeys = 2;
10492                         scope = DIFV_SCOPE_THREAD;
10493                         break;
10494 
10495                 case DIF_OP_STGAA:
10496                 case DIF_OP_STTAA:
10497                         nkeys = ttop;
10498 
10499                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10500                                 key[nkeys++].dttk_size = 0;
10501 
10502                         key[nkeys++].dttk_size = 0;
10503 
10504                         if (op == DIF_OP_STTAA) {
10505                                 scope = DIFV_SCOPE_THREAD;
10506                         } else {
10507                                 scope = DIFV_SCOPE_GLOBAL;
10508                         }
10509 
10510                         break;
10511 
10512                 case DIF_OP_PUSHTR:
10513                         if (ttop == DIF_DTR_NREGS)
10514                                 return;
10515 
10516                         if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10517                                 /*
10518                                  * If the register for the size of the "pushtr"
10519                                  * is %r0 (or the value is 0) and the type is
10520                                  * a string, we'll use the system-wide default
10521                                  * string size.
10522                                  */
10523                                 tupregs[ttop++].dttk_size =
10524                                     dtrace_strsize_default;
10525                         } else {
10526                                 if (srd == 0)
10527                                         return;
10528 
10529                                 if (sval > LONG_MAX)
10530                                         return;
10531 
10532                                 tupregs[ttop++].dttk_size = sval;
10533                         }
10534 
10535                         break;
10536 
10537                 case DIF_OP_PUSHTV:
10538                         if (ttop == DIF_DTR_NREGS)
10539                                 return;
10540 
10541                         tupregs[ttop++].dttk_size = 0;
10542                         break;
10543 
10544                 case DIF_OP_FLUSHTS:
10545                         ttop = 0;
10546                         break;
10547 
10548                 case DIF_OP_POPTS:
10549                         if (ttop != 0)
10550                                 ttop--;
10551                         break;
10552                 }
10553 
10554                 sval = 0;
10555                 srd = 0;
10556 
10557                 if (nkeys == 0)
10558                         continue;
10559 
10560                 /*
10561                  * We have a dynamic variable allocation; calculate its size.
10562                  */
10563                 for (ksize = 0, i = 0; i < nkeys; i++)
10564                         ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10565 
10566                 size = sizeof (dtrace_dynvar_t);
10567                 size += sizeof (dtrace_key_t) * (nkeys - 1);
10568                 size += ksize;
10569 
10570                 /*
10571                  * Now we need to determine the size of the stored data.
10572                  */
10573                 id = DIF_INSTR_VAR(instr);
10574 
10575                 for (i = 0; i < dp->dtdo_varlen; i++) {
10576                         dtrace_difv_t *v = &dp->dtdo_vartab[i];
10577 
10578                         if (v->dtdv_id == id && v->dtdv_scope == scope) {
10579                                 size += v->dtdv_type.dtdt_size;
10580                                 break;
10581                         }
10582                 }
10583 
10584                 if (i == dp->dtdo_varlen)
10585                         return;
10586 
10587                 /*
10588                  * We have the size.  If this is larger than the chunk size
10589                  * for our dynamic variable state, reset the chunk size.
10590                  */
10591                 size = P2ROUNDUP(size, sizeof (uint64_t));
10592 
10593                 /*
10594                  * Before setting the chunk size, check that we're not going
10595                  * to set it to a negative value...
10596                  */
10597                 if (size > LONG_MAX)
10598                         return;
10599 
10600                 /*
10601                  * ...and make certain that we didn't badly overflow.
10602                  */
10603                 if (size < ksize || size < sizeof (dtrace_dynvar_t))
10604                         return;
10605 
10606                 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10607                         vstate->dtvs_dynvars.dtds_chunksize = size;
10608         }
10609 }
10610 
10611 static void
10612 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10613 {
10614         int i, oldsvars, osz, nsz, otlocals, ntlocals;
10615         uint_t id;
10616 
10617         ASSERT(MUTEX_HELD(&dtrace_lock));
10618         ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10619 
10620         for (i = 0; i < dp->dtdo_varlen; i++) {
10621                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10622                 dtrace_statvar_t *svar, ***svarp = NULL;
10623                 size_t dsize = 0;
10624                 uint8_t scope = v->dtdv_scope;
10625                 int *np = NULL;
10626 
10627                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10628                         continue;
10629 
10630                 id -= DIF_VAR_OTHER_UBASE;
10631 
10632                 switch (scope) {
10633                 case DIFV_SCOPE_THREAD:
10634                         while (id >= (otlocals = vstate->dtvs_ntlocals)) {
10635                                 dtrace_difv_t *tlocals;
10636 
10637                                 if ((ntlocals = (otlocals << 1)) == 0)
10638                                         ntlocals = 1;
10639 
10640                                 osz = otlocals * sizeof (dtrace_difv_t);
10641                                 nsz = ntlocals * sizeof (dtrace_difv_t);
10642 
10643                                 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10644 
10645                                 if (osz != 0) {
10646                                         bcopy(vstate->dtvs_tlocals,
10647                                             tlocals, osz);
10648                                         kmem_free(vstate->dtvs_tlocals, osz);
10649                                 }
10650 
10651                                 vstate->dtvs_tlocals = tlocals;
10652                                 vstate->dtvs_ntlocals = ntlocals;
10653                         }
10654 
10655                         vstate->dtvs_tlocals[id] = *v;
10656                         continue;
10657 
10658                 case DIFV_SCOPE_LOCAL:
10659                         np = &vstate->dtvs_nlocals;
10660                         svarp = &vstate->dtvs_locals;
10661 
10662                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10663                                 dsize = NCPU * (v->dtdv_type.dtdt_size +
10664                                     sizeof (uint64_t));
10665                         else
10666                                 dsize = NCPU * sizeof (uint64_t);
10667 
10668                         break;
10669 
10670                 case DIFV_SCOPE_GLOBAL:
10671                         np = &vstate->dtvs_nglobals;
10672                         svarp = &vstate->dtvs_globals;
10673 
10674                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10675                                 dsize = v->dtdv_type.dtdt_size +
10676                                     sizeof (uint64_t);
10677 
10678                         break;
10679 
10680                 default:
10681                         ASSERT(0);
10682                 }
10683 
10684                 while (id >= (oldsvars = *np)) {
10685                         dtrace_statvar_t **statics;
10686                         int newsvars, oldsize, newsize;
10687 
10688                         if ((newsvars = (oldsvars << 1)) == 0)
10689                                 newsvars = 1;
10690 
10691                         oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10692                         newsize = newsvars * sizeof (dtrace_statvar_t *);
10693 
10694                         statics = kmem_zalloc(newsize, KM_SLEEP);
10695 
10696                         if (oldsize != 0) {
10697                                 bcopy(*svarp, statics, oldsize);
10698                                 kmem_free(*svarp, oldsize);
10699                         }
10700 
10701                         *svarp = statics;
10702                         *np = newsvars;
10703                 }
10704 
10705                 if ((svar = (*svarp)[id]) == NULL) {
10706                         svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10707                         svar->dtsv_var = *v;
10708 
10709                         if ((svar->dtsv_size = dsize) != 0) {
10710                                 svar->dtsv_data = (uint64_t)(uintptr_t)
10711                                     kmem_zalloc(dsize, KM_SLEEP);
10712                         }
10713 
10714                         (*svarp)[id] = svar;
10715                 }
10716 
10717                 svar->dtsv_refcnt++;
10718         }
10719 
10720         dtrace_difo_chunksize(dp, vstate);
10721         dtrace_difo_hold(dp);
10722 }
10723 
10724 static dtrace_difo_t *
10725 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10726 {
10727         dtrace_difo_t *new;
10728         size_t sz;
10729 
10730         ASSERT(dp->dtdo_buf != NULL);
10731         ASSERT(dp->dtdo_refcnt != 0);
10732 
10733         new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10734 
10735         ASSERT(dp->dtdo_buf != NULL);
10736         sz = dp->dtdo_len * sizeof (dif_instr_t);
10737         new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10738         bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10739         new->dtdo_len = dp->dtdo_len;
10740 
10741         if (dp->dtdo_strtab != NULL) {
10742                 ASSERT(dp->dtdo_strlen != 0);
10743                 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10744                 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10745                 new->dtdo_strlen = dp->dtdo_strlen;
10746         }
10747 
10748         if (dp->dtdo_inttab != NULL) {
10749                 ASSERT(dp->dtdo_intlen != 0);
10750                 sz = dp->dtdo_intlen * sizeof (uint64_t);
10751                 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10752                 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10753                 new->dtdo_intlen = dp->dtdo_intlen;
10754         }
10755 
10756         if (dp->dtdo_vartab != NULL) {
10757                 ASSERT(dp->dtdo_varlen != 0);
10758                 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10759                 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10760                 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10761                 new->dtdo_varlen = dp->dtdo_varlen;
10762         }
10763 
10764         dtrace_difo_init(new, vstate);
10765         return (new);
10766 }
10767 
10768 static void
10769 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10770 {
10771         int i;
10772 
10773         ASSERT(dp->dtdo_refcnt == 0);
10774 
10775         for (i = 0; i < dp->dtdo_varlen; i++) {
10776                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10777                 dtrace_statvar_t *svar, **svarp = NULL;
10778                 uint_t id;
10779                 uint8_t scope = v->dtdv_scope;
10780                 int *np = NULL;
10781 
10782                 switch (scope) {
10783                 case DIFV_SCOPE_THREAD:
10784                         continue;
10785 
10786                 case DIFV_SCOPE_LOCAL:
10787                         np = &vstate->dtvs_nlocals;
10788                         svarp = vstate->dtvs_locals;
10789                         break;
10790 
10791                 case DIFV_SCOPE_GLOBAL:
10792                         np = &vstate->dtvs_nglobals;
10793                         svarp = vstate->dtvs_globals;
10794                         break;
10795 
10796                 default:
10797                         ASSERT(0);
10798                 }
10799 
10800                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10801                         continue;
10802 
10803                 id -= DIF_VAR_OTHER_UBASE;
10804                 ASSERT(id < *np);
10805 
10806                 svar = svarp[id];
10807                 ASSERT(svar != NULL);
10808                 ASSERT(svar->dtsv_refcnt > 0);
10809 
10810                 if (--svar->dtsv_refcnt > 0)
10811                         continue;
10812 
10813                 if (svar->dtsv_size != 0) {
10814                         ASSERT(svar->dtsv_data != 0);
10815                         kmem_free((void *)(uintptr_t)svar->dtsv_data,
10816                             svar->dtsv_size);
10817                 }
10818 
10819                 kmem_free(svar, sizeof (dtrace_statvar_t));
10820                 svarp[id] = NULL;
10821         }
10822 
10823         if (dp->dtdo_buf != NULL)
10824                 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10825         if (dp->dtdo_inttab != NULL)
10826                 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10827         if (dp->dtdo_strtab != NULL)
10828                 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10829         if (dp->dtdo_vartab != NULL)
10830                 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10831 
10832         kmem_free(dp, sizeof (dtrace_difo_t));
10833 }
10834 
10835 static void
10836 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10837 {
10838         int i;
10839 
10840         ASSERT(MUTEX_HELD(&dtrace_lock));
10841         ASSERT(dp->dtdo_refcnt != 0);
10842 
10843         for (i = 0; i < dp->dtdo_varlen; i++) {
10844                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10845 
10846                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10847                         continue;
10848 
10849                 ASSERT(dtrace_vtime_references > 0);
10850                 if (--dtrace_vtime_references == 0)
10851                         dtrace_vtime_disable();
10852         }
10853 
10854         if (--dp->dtdo_refcnt == 0)
10855                 dtrace_difo_destroy(dp, vstate);
10856 }
10857 
10858 /*
10859  * DTrace Format Functions
10860  */
10861 static uint16_t
10862 dtrace_format_add(dtrace_state_t *state, char *str)
10863 {
10864         char *fmt, **new;
10865         uint16_t ndx, len = strlen(str) + 1;
10866 
10867         fmt = kmem_zalloc(len, KM_SLEEP);
10868         bcopy(str, fmt, len);
10869 
10870         for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10871                 if (state->dts_formats[ndx] == NULL) {
10872                         state->dts_formats[ndx] = fmt;
10873                         return (ndx + 1);
10874                 }
10875         }
10876 
10877         if (state->dts_nformats == USHRT_MAX) {
10878                 /*
10879                  * This is only likely if a denial-of-service attack is being
10880                  * attempted.  As such, it's okay to fail silently here.
10881                  */
10882                 kmem_free(fmt, len);
10883                 return (0);
10884         }
10885 
10886         /*
10887          * For simplicity, we always resize the formats array to be exactly the
10888          * number of formats.
10889          */
10890         ndx = state->dts_nformats++;
10891         new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10892 
10893         if (state->dts_formats != NULL) {
10894                 ASSERT(ndx != 0);
10895                 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10896                 kmem_free(state->dts_formats, ndx * sizeof (char *));
10897         }
10898 
10899         state->dts_formats = new;
10900         state->dts_formats[ndx] = fmt;
10901 
10902         return (ndx + 1);
10903 }
10904 
10905 static void
10906 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10907 {
10908         char *fmt;
10909 
10910         ASSERT(state->dts_formats != NULL);
10911         ASSERT(format <= state->dts_nformats);
10912         ASSERT(state->dts_formats[format - 1] != NULL);
10913 
10914         fmt = state->dts_formats[format - 1];
10915         kmem_free(fmt, strlen(fmt) + 1);
10916         state->dts_formats[format - 1] = NULL;
10917 }
10918 
10919 static void
10920 dtrace_format_destroy(dtrace_state_t *state)
10921 {
10922         int i;
10923 
10924         if (state->dts_nformats == 0) {
10925                 ASSERT(state->dts_formats == NULL);
10926                 return;
10927         }
10928 
10929         ASSERT(state->dts_formats != NULL);
10930 
10931         for (i = 0; i < state->dts_nformats; i++) {
10932                 char *fmt = state->dts_formats[i];
10933 
10934                 if (fmt == NULL)
10935                         continue;
10936 
10937                 kmem_free(fmt, strlen(fmt) + 1);
10938         }
10939 
10940         kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10941         state->dts_nformats = 0;
10942         state->dts_formats = NULL;
10943 }
10944 
10945 /*
10946  * DTrace Predicate Functions
10947  */
10948 static dtrace_predicate_t *
10949 dtrace_predicate_create(dtrace_difo_t *dp)
10950 {
10951         dtrace_predicate_t *pred;
10952 
10953         ASSERT(MUTEX_HELD(&dtrace_lock));
10954         ASSERT(dp->dtdo_refcnt != 0);
10955 
10956         pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10957         pred->dtp_difo = dp;
10958         pred->dtp_refcnt = 1;
10959 
10960         if (!dtrace_difo_cacheable(dp))
10961                 return (pred);
10962 
10963         if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10964                 /*
10965                  * This is only theoretically possible -- we have had 2^32
10966                  * cacheable predicates on this machine.  We cannot allow any
10967                  * more predicates to become cacheable:  as unlikely as it is,
10968                  * there may be a thread caching a (now stale) predicate cache
10969                  * ID. (N.B.: the temptation is being successfully resisted to
10970                  * have this cmn_err() "Holy shit -- we executed this code!")
10971                  */
10972                 return (pred);
10973         }
10974 
10975         pred->dtp_cacheid = dtrace_predcache_id++;
10976 
10977         return (pred);
10978 }
10979 
10980 static void
10981 dtrace_predicate_hold(dtrace_predicate_t *pred)
10982 {
10983         ASSERT(MUTEX_HELD(&dtrace_lock));
10984         ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10985         ASSERT(pred->dtp_refcnt > 0);
10986 
10987         pred->dtp_refcnt++;
10988 }
10989 
10990 static void
10991 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10992 {
10993         dtrace_difo_t *dp = pred->dtp_difo;
10994 
10995         ASSERT(MUTEX_HELD(&dtrace_lock));
10996         ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10997         ASSERT(pred->dtp_refcnt > 0);
10998 
10999         if (--pred->dtp_refcnt == 0) {
11000                 dtrace_difo_release(pred->dtp_difo, vstate);
11001                 kmem_free(pred, sizeof (dtrace_predicate_t));
11002         }
11003 }
11004 
11005 /*
11006  * DTrace Action Description Functions
11007  */
11008 static dtrace_actdesc_t *
11009 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
11010     uint64_t uarg, uint64_t arg)
11011 {
11012         dtrace_actdesc_t *act;
11013 
11014 #ifdef illumos
11015         ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
11016             arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
11017 #endif
11018 
11019         act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
11020         act->dtad_kind = kind;
11021         act->dtad_ntuple = ntuple;
11022         act->dtad_uarg = uarg;
11023         act->dtad_arg = arg;
11024         act->dtad_refcnt = 1;
11025 
11026         return (act);
11027 }
11028 
11029 static void
11030 dtrace_actdesc_hold(dtrace_actdesc_t *act)
11031 {
11032         ASSERT(act->dtad_refcnt >= 1);
11033         act->dtad_refcnt++;
11034 }
11035 
11036 static void
11037 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
11038 {
11039         dtrace_actkind_t kind = act->dtad_kind;
11040         dtrace_difo_t *dp;
11041 
11042         ASSERT(act->dtad_refcnt >= 1);
11043 
11044         if (--act->dtad_refcnt != 0)
11045                 return;
11046 
11047         if ((dp = act->dtad_difo) != NULL)
11048                 dtrace_difo_release(dp, vstate);
11049 
11050         if (DTRACEACT_ISPRINTFLIKE(kind)) {
11051                 char *str = (char *)(uintptr_t)act->dtad_arg;
11052 
11053 #ifdef illumos
11054                 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
11055                     (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
11056 #endif
11057 
11058                 if (str != NULL)
11059                         kmem_free(str, strlen(str) + 1);
11060         }
11061 
11062         kmem_free(act, sizeof (dtrace_actdesc_t));
11063 }
11064 
11065 /*
11066  * DTrace ECB Functions
11067  */
11068 static dtrace_ecb_t *
11069 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
11070 {
11071         dtrace_ecb_t *ecb;
11072         dtrace_epid_t epid;
11073 
11074         ASSERT(MUTEX_HELD(&dtrace_lock));
11075 
11076         ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
11077         ecb->dte_predicate = NULL;
11078         ecb->dte_probe = probe;
11079 
11080         /*
11081          * The default size is the size of the default action: recording
11082          * the header.
11083          */
11084         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
11085         ecb->dte_alignment = sizeof (dtrace_epid_t);
11086 
11087         epid = state->dts_epid++;
11088 
11089         if (epid - 1 >= state->dts_necbs) {
11090                 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
11091                 int necbs = state->dts_necbs << 1;
11092 
11093                 ASSERT(epid == state->dts_necbs + 1);
11094 
11095                 if (necbs == 0) {
11096                         ASSERT(oecbs == NULL);
11097                         necbs = 1;
11098                 }
11099 
11100                 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
11101 
11102                 if (oecbs != NULL)
11103                         bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
11104 
11105                 dtrace_membar_producer();
11106                 state->dts_ecbs = ecbs;
11107 
11108                 if (oecbs != NULL) {
11109                         /*
11110                          * If this state is active, we must dtrace_sync()
11111                          * before we can free the old dts_ecbs array:  we're
11112                          * coming in hot, and there may be active ring
11113                          * buffer processing (which indexes into the dts_ecbs
11114                          * array) on another CPU.
11115                          */
11116                         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
11117                                 dtrace_sync();
11118 
11119                         kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
11120                 }
11121 
11122                 dtrace_membar_producer();
11123                 state->dts_necbs = necbs;
11124         }
11125 
11126         ecb->dte_state = state;
11127 
11128         ASSERT(state->dts_ecbs[epid - 1] == NULL);
11129         dtrace_membar_producer();
11130         state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
11131 
11132         return (ecb);
11133 }
11134 
11135 static void
11136 dtrace_ecb_enable(dtrace_ecb_t *ecb)
11137 {
11138         dtrace_probe_t *probe = ecb->dte_probe;
11139 
11140         ASSERT(MUTEX_HELD(&cpu_lock));
11141         ASSERT(MUTEX_HELD(&dtrace_lock));
11142         ASSERT(ecb->dte_next == NULL);
11143 
11144         if (probe == NULL) {
11145                 /*
11146                  * This is the NULL probe -- there's nothing to do.
11147                  */
11148                 return;
11149         }
11150 
11151         if (probe->dtpr_ecb == NULL) {
11152                 dtrace_provider_t *prov = probe->dtpr_provider;
11153 
11154                 /*
11155                  * We're the first ECB on this probe.
11156                  */
11157                 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
11158 
11159                 if (ecb->dte_predicate != NULL)
11160                         probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
11161 
11162                 prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
11163                     probe->dtpr_id, probe->dtpr_arg);
11164         } else {
11165                 /*
11166                  * This probe is already active.  Swing the last pointer to
11167                  * point to the new ECB, and issue a dtrace_sync() to assure
11168                  * that all CPUs have seen the change.
11169                  */
11170                 ASSERT(probe->dtpr_ecb_last != NULL);
11171                 probe->dtpr_ecb_last->dte_next = ecb;
11172                 probe->dtpr_ecb_last = ecb;
11173                 probe->dtpr_predcache = 0;
11174 
11175                 dtrace_sync();
11176         }
11177 }
11178 
11179 static int
11180 dtrace_ecb_resize(dtrace_ecb_t *ecb)
11181 {
11182         dtrace_action_t *act;
11183         uint32_t curneeded = UINT32_MAX;
11184         uint32_t aggbase = UINT32_MAX;
11185 
11186         /*
11187          * If we record anything, we always record the dtrace_rechdr_t.  (And
11188          * we always record it first.)
11189          */
11190         ecb->dte_size = sizeof (dtrace_rechdr_t);
11191         ecb->dte_alignment = sizeof (dtrace_epid_t);
11192 
11193         for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11194                 dtrace_recdesc_t *rec = &act->dta_rec;
11195                 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
11196 
11197                 ecb->dte_alignment = MAX(ecb->dte_alignment,
11198                     rec->dtrd_alignment);
11199 
11200                 if (DTRACEACT_ISAGG(act->dta_kind)) {
11201                         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11202 
11203                         ASSERT(rec->dtrd_size != 0);
11204                         ASSERT(agg->dtag_first != NULL);
11205                         ASSERT(act->dta_prev->dta_intuple);
11206                         ASSERT(aggbase != UINT32_MAX);
11207                         ASSERT(curneeded != UINT32_MAX);
11208 
11209                         agg->dtag_base = aggbase;
11210 
11211                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11212                         rec->dtrd_offset = curneeded;
11213                         if (curneeded + rec->dtrd_size < curneeded)
11214                                 return (EINVAL);
11215                         curneeded += rec->dtrd_size;
11216                         ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
11217 
11218                         aggbase = UINT32_MAX;
11219                         curneeded = UINT32_MAX;
11220                 } else if (act->dta_intuple) {
11221                         if (curneeded == UINT32_MAX) {
11222                                 /*
11223                                  * This is the first record in a tuple.  Align
11224                                  * curneeded to be at offset 4 in an 8-byte
11225                                  * aligned block.
11226                                  */
11227                                 ASSERT(act->dta_prev == NULL ||
11228                                     !act->dta_prev->dta_intuple);
11229                                 ASSERT3U(aggbase, ==, UINT32_MAX);
11230                                 curneeded = P2PHASEUP(ecb->dte_size,
11231                                     sizeof (uint64_t), sizeof (dtrace_aggid_t));
11232 
11233                                 aggbase = curneeded - sizeof (dtrace_aggid_t);
11234                                 ASSERT(IS_P2ALIGNED(aggbase,
11235                                     sizeof (uint64_t)));
11236                         }
11237                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11238                         rec->dtrd_offset = curneeded;
11239                         if (curneeded + rec->dtrd_size < curneeded)
11240                                 return (EINVAL);
11241                         curneeded += rec->dtrd_size;
11242                 } else {
11243                         /* tuples must be followed by an aggregation */
11244                         ASSERT(act->dta_prev == NULL ||
11245                             !act->dta_prev->dta_intuple);
11246 
11247                         ecb->dte_size = P2ROUNDUP(ecb->dte_size,
11248                             rec->dtrd_alignment);
11249                         rec->dtrd_offset = ecb->dte_size;
11250                         if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
11251                                 return (EINVAL);
11252                         ecb->dte_size += rec->dtrd_size;
11253                         ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
11254                 }
11255         }
11256 
11257         if ((act = ecb->dte_action) != NULL &&
11258             !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
11259             ecb->dte_size == sizeof (dtrace_rechdr_t)) {
11260                 /*
11261                  * If the size is still sizeof (dtrace_rechdr_t), then all
11262                  * actions store no data; set the size to 0.
11263                  */
11264                 ecb->dte_size = 0;
11265         }
11266 
11267         ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
11268         ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
11269         ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
11270             ecb->dte_needed);
11271         return (0);
11272 }
11273 
11274 static dtrace_action_t *
11275 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11276 {
11277         dtrace_aggregation_t *agg;
11278         size_t size = sizeof (uint64_t);
11279         int ntuple = desc->dtad_ntuple;
11280         dtrace_action_t *act;
11281         dtrace_recdesc_t *frec;
11282         dtrace_aggid_t aggid;
11283         dtrace_state_t *state = ecb->dte_state;
11284 
11285         agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
11286         agg->dtag_ecb = ecb;
11287 
11288         ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
11289 
11290         switch (desc->dtad_kind) {
11291         case DTRACEAGG_MIN:
11292                 agg->dtag_initial = INT64_MAX;
11293                 agg->dtag_aggregate = dtrace_aggregate_min;
11294                 break;
11295 
11296         case DTRACEAGG_MAX:
11297                 agg->dtag_initial = INT64_MIN;
11298                 agg->dtag_aggregate = dtrace_aggregate_max;
11299                 break;
11300 
11301         case DTRACEAGG_COUNT:
11302                 agg->dtag_aggregate = dtrace_aggregate_count;
11303                 break;
11304 
11305         case DTRACEAGG_QUANTIZE:
11306                 agg->dtag_aggregate = dtrace_aggregate_quantize;
11307                 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11308                     sizeof (uint64_t);
11309                 break;
11310 
11311         case DTRACEAGG_LQUANTIZE: {
11312                 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11313                 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11314 
11315                 agg->dtag_initial = desc->dtad_arg;
11316                 agg->dtag_aggregate = dtrace_aggregate_lquantize;
11317 
11318                 if (step == 0 || levels == 0)
11319                         goto err;
11320 
11321                 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11322                 break;
11323         }
11324 
11325         case DTRACEAGG_LLQUANTIZE: {
11326                 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11327                 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11328                 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11329                 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11330                 int64_t v;
11331 
11332                 agg->dtag_initial = desc->dtad_arg;
11333                 agg->dtag_aggregate = dtrace_aggregate_llquantize;
11334 
11335                 if (factor < 2 || low >= high || nsteps < factor)
11336                         goto err;
11337 
11338                 /*
11339                  * Now check that the number of steps evenly divides a power
11340                  * of the factor.  (This assures both integer bucket size and
11341                  * linearity within each magnitude.)
11342                  */
11343                 for (v = factor; v < nsteps; v *= factor)
11344                         continue;
11345 
11346                 if ((v % nsteps) || (nsteps % factor))
11347                         goto err;
11348 
11349                 size = (dtrace_aggregate_llquantize_bucket(factor,
11350                     low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11351                 break;
11352         }
11353 
11354         case DTRACEAGG_AVG:
11355                 agg->dtag_aggregate = dtrace_aggregate_avg;
11356                 size = sizeof (uint64_t) * 2;
11357                 break;
11358 
11359         case DTRACEAGG_STDDEV:
11360                 agg->dtag_aggregate = dtrace_aggregate_stddev;
11361                 size = sizeof (uint64_t) * 4;
11362                 break;
11363 
11364         case DTRACEAGG_SUM:
11365                 agg->dtag_aggregate = dtrace_aggregate_sum;
11366                 break;
11367 
11368         default:
11369                 goto err;
11370         }
11371 
11372         agg->dtag_action.dta_rec.dtrd_size = size;
11373 
11374         if (ntuple == 0)
11375                 goto err;
11376 
11377         /*
11378          * We must make sure that we have enough actions for the n-tuple.
11379          */
11380         for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11381                 if (DTRACEACT_ISAGG(act->dta_kind))
11382                         break;
11383 
11384                 if (--ntuple == 0) {
11385                         /*
11386                          * This is the action with which our n-tuple begins.
11387                          */
11388                         agg->dtag_first = act;
11389                         goto success;
11390                 }
11391         }
11392 
11393         /*
11394          * This n-tuple is short by ntuple elements.  Return failure.
11395          */
11396         ASSERT(ntuple != 0);
11397 err:
11398         kmem_free(agg, sizeof (dtrace_aggregation_t));
11399         return (NULL);
11400 
11401 success:
11402         /*
11403          * If the last action in the tuple has a size of zero, it's actually
11404          * an expression argument for the aggregating action.
11405          */
11406         ASSERT(ecb->dte_action_last != NULL);
11407         act = ecb->dte_action_last;
11408 
11409         if (act->dta_kind == DTRACEACT_DIFEXPR) {
11410                 ASSERT(act->dta_difo != NULL);
11411 
11412                 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11413                         agg->dtag_hasarg = 1;
11414         }
11415 
11416         /*
11417          * We need to allocate an id for this aggregation.
11418          */
11419 #ifdef illumos
11420         aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11421             VM_BESTFIT | VM_SLEEP);
11422 #else
11423         aggid = alloc_unr(state->dts_aggid_arena);
11424 #endif
11425 
11426         if (aggid - 1 >= state->dts_naggregations) {
11427                 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11428                 dtrace_aggregation_t **aggs;
11429                 int naggs = state->dts_naggregations << 1;
11430                 int onaggs = state->dts_naggregations;
11431 
11432                 ASSERT(aggid == state->dts_naggregations + 1);
11433 
11434                 if (naggs == 0) {
11435                         ASSERT(oaggs == NULL);
11436                         naggs = 1;
11437                 }
11438 
11439                 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11440 
11441                 if (oaggs != NULL) {
11442                         bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11443                         kmem_free(oaggs, onaggs * sizeof (*aggs));
11444                 }
11445 
11446                 state->dts_aggregations = aggs;
11447                 state->dts_naggregations = naggs;
11448         }
11449 
11450         ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11451         state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11452 
11453         frec = &agg->dtag_first->dta_rec;
11454         if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11455                 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11456 
11457         for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11458                 ASSERT(!act->dta_intuple);
11459                 act->dta_intuple = 1;
11460         }
11461 
11462         return (&agg->dtag_action);
11463 }
11464 
11465 static void
11466 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11467 {
11468         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11469         dtrace_state_t *state = ecb->dte_state;
11470         dtrace_aggid_t aggid = agg->dtag_id;
11471 
11472         ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11473 #ifdef illumos
11474         vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11475 #else
11476         free_unr(state->dts_aggid_arena, aggid);
11477 #endif
11478 
11479         ASSERT(state->dts_aggregations[aggid - 1] == agg);
11480         state->dts_aggregations[aggid - 1] = NULL;
11481 
11482         kmem_free(agg, sizeof (dtrace_aggregation_t));
11483 }
11484 
11485 static int
11486 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11487 {
11488         dtrace_action_t *action, *last;
11489         dtrace_difo_t *dp = desc->dtad_difo;
11490         uint32_t size = 0, align = sizeof (uint8_t), mask;
11491         uint16_t format = 0;
11492         dtrace_recdesc_t *rec;
11493         dtrace_state_t *state = ecb->dte_state;
11494         dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
11495         uint64_t arg = desc->dtad_arg;
11496 
11497         ASSERT(MUTEX_HELD(&dtrace_lock));
11498         ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11499 
11500         if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11501                 /*
11502                  * If this is an aggregating action, there must be neither
11503                  * a speculate nor a commit on the action chain.
11504                  */
11505                 dtrace_action_t *act;
11506 
11507                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11508                         if (act->dta_kind == DTRACEACT_COMMIT)
11509                                 return (EINVAL);
11510 
11511                         if (act->dta_kind == DTRACEACT_SPECULATE)
11512                                 return (EINVAL);
11513                 }
11514 
11515                 action = dtrace_ecb_aggregation_create(ecb, desc);
11516 
11517                 if (action == NULL)
11518                         return (EINVAL);
11519         } else {
11520                 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11521                     (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11522                     dp != NULL && dp->dtdo_destructive)) {
11523                         state->dts_destructive = 1;
11524                 }
11525 
11526                 switch (desc->dtad_kind) {
11527                 case DTRACEACT_PRINTF:
11528                 case DTRACEACT_PRINTA:
11529                 case DTRACEACT_SYSTEM:
11530                 case DTRACEACT_FREOPEN:
11531                 case DTRACEACT_DIFEXPR:
11532                         /*
11533                          * We know that our arg is a string -- turn it into a
11534                          * format.
11535                          */
11536                         if (arg == 0) {
11537                                 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11538                                     desc->dtad_kind == DTRACEACT_DIFEXPR);
11539                                 format = 0;
11540                         } else {
11541                                 ASSERT(arg != 0);
11542 #ifdef illumos
11543                                 ASSERT(arg > KERNELBASE);
11544 #endif
11545                                 format = dtrace_format_add(state,
11546                                     (char *)(uintptr_t)arg);
11547                         }
11548 
11549                         /*FALLTHROUGH*/
11550                 case DTRACEACT_LIBACT:
11551                 case DTRACEACT_TRACEMEM:
11552                 case DTRACEACT_TRACEMEM_DYNSIZE:
11553                         if (dp == NULL)
11554                                 return (EINVAL);
11555 
11556                         if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11557                                 break;
11558 
11559                         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11560                                 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11561                                         return (EINVAL);
11562 
11563                                 size = opt[DTRACEOPT_STRSIZE];
11564                         }
11565 
11566                         break;
11567 
11568                 case DTRACEACT_STACK:
11569                         if ((nframes = arg) == 0) {
11570                                 nframes = opt[DTRACEOPT_STACKFRAMES];
11571                                 ASSERT(nframes > 0);
11572                                 arg = nframes;
11573                         }
11574 
11575                         size = nframes * sizeof (pc_t);
11576                         break;
11577 
11578                 case DTRACEACT_JSTACK:
11579                         if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11580                                 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11581 
11582                         if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11583                                 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11584 
11585                         arg = DTRACE_USTACK_ARG(nframes, strsize);
11586 
11587                         /*FALLTHROUGH*/
11588                 case DTRACEACT_USTACK:
11589                         if (desc->dtad_kind != DTRACEACT_JSTACK &&
11590                             (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11591                                 strsize = DTRACE_USTACK_STRSIZE(arg);
11592                                 nframes = opt[DTRACEOPT_USTACKFRAMES];
11593                                 ASSERT(nframes > 0);
11594                                 arg = DTRACE_USTACK_ARG(nframes, strsize);
11595                         }
11596 
11597                         /*
11598                          * Save a slot for the pid.
11599                          */
11600                         size = (nframes + 1) * sizeof (uint64_t);
11601                         size += DTRACE_USTACK_STRSIZE(arg);
11602                         size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11603 
11604                         break;
11605 
11606                 case DTRACEACT_SYM:
11607                 case DTRACEACT_MOD:
11608                         if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11609                             sizeof (uint64_t)) ||
11610                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11611                                 return (EINVAL);
11612                         break;
11613 
11614                 case DTRACEACT_USYM:
11615                 case DTRACEACT_UMOD:
11616                 case DTRACEACT_UADDR:
11617                         if (dp == NULL ||
11618                             (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11619                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11620                                 return (EINVAL);
11621 
11622                         /*
11623                          * We have a slot for the pid, plus a slot for the
11624                          * argument.  To keep things simple (aligned with
11625                          * bitness-neutral sizing), we store each as a 64-bit
11626                          * quantity.
11627                          */
11628                         size = 2 * sizeof (uint64_t);
11629                         break;
11630 
11631                 case DTRACEACT_STOP:
11632                 case DTRACEACT_BREAKPOINT:
11633                 case DTRACEACT_PANIC:
11634                         break;
11635 
11636                 case DTRACEACT_CHILL:
11637                 case DTRACEACT_DISCARD:
11638                 case DTRACEACT_RAISE:
11639                         if (dp == NULL)
11640                                 return (EINVAL);
11641                         break;
11642 
11643                 case DTRACEACT_EXIT:
11644                         if (dp == NULL ||
11645                             (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11646                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11647                                 return (EINVAL);
11648                         break;
11649 
11650                 case DTRACEACT_SPECULATE:
11651                         if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11652                                 return (EINVAL);
11653 
11654                         if (dp == NULL)
11655                                 return (EINVAL);
11656 
11657                         state->dts_speculates = 1;
11658                         break;
11659 
11660                 case DTRACEACT_PRINTM:
11661                         size = dp->dtdo_rtype.dtdt_size;
11662                         break;
11663 
11664                 case DTRACEACT_COMMIT: {
11665                         dtrace_action_t *act = ecb->dte_action;
11666 
11667                         for (; act != NULL; act = act->dta_next) {
11668                                 if (act->dta_kind == DTRACEACT_COMMIT)
11669                                         return (EINVAL);
11670                         }
11671 
11672                         if (dp == NULL)
11673                                 return (EINVAL);
11674                         break;
11675                 }
11676 
11677                 default:
11678                         return (EINVAL);
11679                 }
11680 
11681                 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11682                         /*
11683                          * If this is a data-storing action or a speculate,
11684                          * we must be sure that there isn't a commit on the
11685                          * action chain.
11686                          */
11687                         dtrace_action_t *act = ecb->dte_action;
11688 
11689                         for (; act != NULL; act = act->dta_next) {
11690                                 if (act->dta_kind == DTRACEACT_COMMIT)
11691                                         return (EINVAL);
11692                         }
11693                 }
11694 
11695                 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11696                 action->dta_rec.dtrd_size = size;
11697         }
11698 
11699         action->dta_refcnt = 1;
11700         rec = &action->dta_rec;
11701         size = rec->dtrd_size;
11702 
11703         for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11704                 if (!(size & mask)) {
11705                         align = mask + 1;
11706                         break;
11707                 }
11708         }
11709 
11710         action->dta_kind = desc->dtad_kind;
11711 
11712         if ((action->dta_difo = dp) != NULL)
11713                 dtrace_difo_hold(dp);
11714 
11715         rec->dtrd_action = action->dta_kind;
11716         rec->dtrd_arg = arg;
11717         rec->dtrd_uarg = desc->dtad_uarg;
11718         rec->dtrd_alignment = (uint16_t)align;
11719         rec->dtrd_format = format;
11720 
11721         if ((last = ecb->dte_action_last) != NULL) {
11722                 ASSERT(ecb->dte_action != NULL);
11723                 action->dta_prev = last;
11724                 last->dta_next = action;
11725         } else {
11726                 ASSERT(ecb->dte_action == NULL);
11727                 ecb->dte_action = action;
11728         }
11729 
11730         ecb->dte_action_last = action;
11731 
11732         return (0);
11733 }
11734 
11735 static void
11736 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11737 {
11738         dtrace_action_t *act = ecb->dte_action, *next;
11739         dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11740         dtrace_difo_t *dp;
11741         uint16_t format;
11742 
11743         if (act != NULL && act->dta_refcnt > 1) {
11744                 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11745                 act->dta_refcnt--;
11746         } else {
11747                 for (; act != NULL; act = next) {
11748                         next = act->dta_next;
11749                         ASSERT(next != NULL || act == ecb->dte_action_last);
11750                         ASSERT(act->dta_refcnt == 1);
11751 
11752                         if ((format = act->dta_rec.dtrd_format) != 0)
11753                                 dtrace_format_remove(ecb->dte_state, format);
11754 
11755                         if ((dp = act->dta_difo) != NULL)
11756                                 dtrace_difo_release(dp, vstate);
11757 
11758                         if (DTRACEACT_ISAGG(act->dta_kind)) {
11759                                 dtrace_ecb_aggregation_destroy(ecb, act);
11760                         } else {
11761                                 kmem_free(act, sizeof (dtrace_action_t));
11762                         }
11763                 }
11764         }
11765 
11766         ecb->dte_action = NULL;
11767         ecb->dte_action_last = NULL;
11768         ecb->dte_size = 0;
11769 }
11770 
11771 static void
11772 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11773 {
11774         /*
11775          * We disable the ECB by removing it from its probe.
11776          */
11777         dtrace_ecb_t *pecb, *prev = NULL;
11778         dtrace_probe_t *probe = ecb->dte_probe;
11779 
11780         ASSERT(MUTEX_HELD(&dtrace_lock));
11781 
11782         if (probe == NULL) {
11783                 /*
11784                  * This is the NULL probe; there is nothing to disable.
11785                  */
11786                 return;
11787         }
11788 
11789         for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11790                 if (pecb == ecb)
11791                         break;
11792                 prev = pecb;
11793         }
11794 
11795         ASSERT(pecb != NULL);
11796 
11797         if (prev == NULL) {
11798                 probe->dtpr_ecb = ecb->dte_next;
11799         } else {
11800                 prev->dte_next = ecb->dte_next;
11801         }
11802 
11803         if (ecb == probe->dtpr_ecb_last) {
11804                 ASSERT(ecb->dte_next == NULL);
11805                 probe->dtpr_ecb_last = prev;
11806         }
11807 
11808         /*
11809          * The ECB has been disconnected from the probe; now sync to assure
11810          * that all CPUs have seen the change before returning.
11811          */
11812         dtrace_sync();
11813 
11814         if (probe->dtpr_ecb == NULL) {
11815                 /*
11816                  * That was the last ECB on the probe; clear the predicate
11817                  * cache ID for the probe, disable it and sync one more time
11818                  * to assure that we'll never hit it again.
11819                  */
11820                 dtrace_provider_t *prov = probe->dtpr_provider;
11821 
11822                 ASSERT(ecb->dte_next == NULL);
11823                 ASSERT(probe->dtpr_ecb_last == NULL);
11824                 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11825                 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11826                     probe->dtpr_id, probe->dtpr_arg);
11827                 dtrace_sync();
11828         } else {
11829                 /*
11830                  * There is at least one ECB remaining on the probe.  If there
11831                  * is _exactly_ one, set the probe's predicate cache ID to be
11832                  * the predicate cache ID of the remaining ECB.
11833                  */
11834                 ASSERT(probe->dtpr_ecb_last != NULL);
11835                 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11836 
11837                 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11838                         dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11839 
11840                         ASSERT(probe->dtpr_ecb->dte_next == NULL);
11841 
11842                         if (p != NULL)
11843                                 probe->dtpr_predcache = p->dtp_cacheid;
11844                 }
11845 
11846                 ecb->dte_next = NULL;
11847         }
11848 }
11849 
11850 static void
11851 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11852 {
11853         dtrace_state_t *state = ecb->dte_state;
11854         dtrace_vstate_t *vstate = &state->dts_vstate;
11855         dtrace_predicate_t *pred;
11856         dtrace_epid_t epid = ecb->dte_epid;
11857 
11858         ASSERT(MUTEX_HELD(&dtrace_lock));
11859         ASSERT(ecb->dte_next == NULL);
11860         ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11861 
11862         if ((pred = ecb->dte_predicate) != NULL)
11863                 dtrace_predicate_release(pred, vstate);
11864 
11865         dtrace_ecb_action_remove(ecb);
11866 
11867         ASSERT(state->dts_ecbs[epid - 1] == ecb);
11868         state->dts_ecbs[epid - 1] = NULL;
11869 
11870         kmem_free(ecb, sizeof (dtrace_ecb_t));
11871 }
11872 
11873 static dtrace_ecb_t *
11874 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11875     dtrace_enabling_t *enab)
11876 {
11877         dtrace_ecb_t *ecb;
11878         dtrace_predicate_t *pred;
11879         dtrace_actdesc_t *act;
11880         dtrace_provider_t *prov;
11881         dtrace_ecbdesc_t *desc = enab->dten_current;
11882 
11883         ASSERT(MUTEX_HELD(&dtrace_lock));
11884         ASSERT(state != NULL);
11885 
11886         ecb = dtrace_ecb_add(state, probe);
11887         ecb->dte_uarg = desc->dted_uarg;
11888 
11889         if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11890                 dtrace_predicate_hold(pred);
11891                 ecb->dte_predicate = pred;
11892         }
11893 
11894         if (probe != NULL) {
11895                 /*
11896                  * If the provider shows more leg than the consumer is old
11897                  * enough to see, we need to enable the appropriate implicit
11898                  * predicate bits to prevent the ecb from activating at
11899                  * revealing times.
11900                  *
11901                  * Providers specifying DTRACE_PRIV_USER at register time
11902                  * are stating that they need the /proc-style privilege
11903                  * model to be enforced, and this is what DTRACE_COND_OWNER
11904                  * and DTRACE_COND_ZONEOWNER will then do at probe time.
11905                  */
11906                 prov = probe->dtpr_provider;
11907                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11908                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11909                         ecb->dte_cond |= DTRACE_COND_OWNER;
11910 
11911                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11912                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11913                         ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11914 
11915                 /*
11916                  * If the provider shows us kernel innards and the user
11917                  * is lacking sufficient privilege, enable the
11918                  * DTRACE_COND_USERMODE implicit predicate.
11919                  */
11920                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11921                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11922                         ecb->dte_cond |= DTRACE_COND_USERMODE;
11923         }
11924 
11925         if (dtrace_ecb_create_cache != NULL) {
11926                 /*
11927                  * If we have a cached ecb, we'll use its action list instead
11928                  * of creating our own (saving both time and space).
11929                  */
11930                 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11931                 dtrace_action_t *act = cached->dte_action;
11932 
11933                 if (act != NULL) {
11934                         ASSERT(act->dta_refcnt > 0);
11935                         act->dta_refcnt++;
11936                         ecb->dte_action = act;
11937                         ecb->dte_action_last = cached->dte_action_last;
11938                         ecb->dte_needed = cached->dte_needed;
11939                         ecb->dte_size = cached->dte_size;
11940                         ecb->dte_alignment = cached->dte_alignment;
11941                 }
11942 
11943                 return (ecb);
11944         }
11945 
11946         for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11947                 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11948                         dtrace_ecb_destroy(ecb);
11949                         return (NULL);
11950                 }
11951         }
11952 
11953         if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11954                 dtrace_ecb_destroy(ecb);
11955                 return (NULL);
11956         }
11957 
11958         return (dtrace_ecb_create_cache = ecb);
11959 }
11960 
11961 static int
11962 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11963 {
11964         dtrace_ecb_t *ecb;
11965         dtrace_enabling_t *enab = arg;
11966         dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11967 
11968         ASSERT(state != NULL);
11969 
11970         if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11971                 /*
11972                  * This probe was created in a generation for which this
11973                  * enabling has previously created ECBs; we don't want to
11974                  * enable it again, so just kick out.
11975                  */
11976                 return (DTRACE_MATCH_NEXT);
11977         }
11978 
11979         if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11980                 return (DTRACE_MATCH_DONE);
11981 
11982         dtrace_ecb_enable(ecb);
11983         return (DTRACE_MATCH_NEXT);
11984 }
11985 
11986 static dtrace_ecb_t *
11987 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11988 {
11989         dtrace_ecb_t *ecb;
11990 
11991         ASSERT(MUTEX_HELD(&dtrace_lock));
11992 
11993         if (id == 0 || id > state->dts_necbs)
11994                 return (NULL);
11995 
11996         ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11997         ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11998 
11999         return (state->dts_ecbs[id - 1]);
12000 }
12001 
12002 static dtrace_aggregation_t *
12003 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
12004 {
12005         dtrace_aggregation_t *agg;
12006 
12007         ASSERT(MUTEX_HELD(&dtrace_lock));
12008 
12009         if (id == 0 || id > state->dts_naggregations)
12010                 return (NULL);
12011 
12012         ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
12013         ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
12014             agg->dtag_id == id);
12015 
12016         return (state->dts_aggregations[id - 1]);
12017 }
12018 
12019 /*
12020  * DTrace Buffer Functions
12021  *
12022  * The following functions manipulate DTrace buffers.  Most of these functions
12023  * are called in the context of establishing or processing consumer state;
12024  * exceptions are explicitly noted.
12025  */
12026 
12027 /*
12028  * Note:  called from cross call context.  This function switches the two
12029  * buffers on a given CPU.  The atomicity of this operation is assured by
12030  * disabling interrupts while the actual switch takes place; the disabling of
12031  * interrupts serializes the execution with any execution of dtrace_probe() on
12032  * the same CPU.
12033  */
12034 static void
12035 dtrace_buffer_switch(dtrace_buffer_t *buf)
12036 {
12037         caddr_t tomax = buf->dtb_tomax;
12038         caddr_t xamot = buf->dtb_xamot;
12039         dtrace_icookie_t cookie;
12040         hrtime_t now;
12041 
12042         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12043         ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
12044 
12045         cookie = dtrace_interrupt_disable();
12046         now = dtrace_gethrtime();
12047         buf->dtb_tomax = xamot;
12048         buf->dtb_xamot = tomax;
12049         buf->dtb_xamot_drops = buf->dtb_drops;
12050         buf->dtb_xamot_offset = buf->dtb_offset;
12051         buf->dtb_xamot_errors = buf->dtb_errors;
12052         buf->dtb_xamot_flags = buf->dtb_flags;
12053         buf->dtb_offset = 0;
12054         buf->dtb_drops = 0;
12055         buf->dtb_errors = 0;
12056         buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
12057         buf->dtb_interval = now - buf->dtb_switched;
12058         buf->dtb_switched = now;
12059         dtrace_interrupt_enable(cookie);
12060 }
12061 
12062 /*
12063  * Note:  called from cross call context.  This function activates a buffer
12064  * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
12065  * is guaranteed by the disabling of interrupts.
12066  */
12067 static void
12068 dtrace_buffer_activate(dtrace_state_t *state)
12069 {
12070         dtrace_buffer_t *buf;
12071         dtrace_icookie_t cookie = dtrace_interrupt_disable();
12072 
12073         buf = &state->dts_buffer[curcpu];
12074 
12075         if (buf->dtb_tomax != NULL) {
12076                 /*
12077                  * We might like to assert that the buffer is marked inactive,
12078                  * but this isn't necessarily true:  the buffer for the CPU
12079                  * that processes the BEGIN probe has its buffer activated
12080                  * manually.  In this case, we take the (harmless) action
12081                  * re-clearing the bit INACTIVE bit.
12082                  */
12083                 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
12084         }
12085 
12086         dtrace_interrupt_enable(cookie);
12087 }
12088 
#ifdef __FreeBSD__
/*
 * Activate the buffer belonging to the specified CPU by clearing its
 * INACTIVE flag, provided that the CPU has a buffer at all.  This stands in
 * for dtrace_buffer_activate() when APs have not yet started, i.e. when
 * activating anonymous state.
 */
static void
dtrace_buffer_activate_cpu(dtrace_state_t *state, int cpu)
{
        dtrace_buffer_t *buf = &state->dts_buffer[cpu];

        if (buf->dtb_tomax == NULL)
                return;

        buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
}
#endif
12103 
12104 static int
12105 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
12106     processorid_t cpu, int *factor)
12107 {
12108 #ifdef illumos
12109         cpu_t *cp;
12110 #endif
12111         dtrace_buffer_t *buf;
12112         int allocated = 0, desired = 0;
12113 
12114 #ifdef illumos
12115         ASSERT(MUTEX_HELD(&cpu_lock));
12116         ASSERT(MUTEX_HELD(&dtrace_lock));
12117 
12118         *factor = 1;
12119 
12120         if (size > dtrace_nonroot_maxsize &&
12121             !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
12122                 return (EFBIG);
12123 
12124         cp = cpu_list;
12125 
12126         do {
12127                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12128                         continue;
12129 
12130                 buf = &bufs[cp->cpu_id];
12131 
12132                 /*
12133                  * If there is already a buffer allocated for this CPU, it
12134                  * is only possible that this is a DR event.  In this case,
12135                  */
12136                 if (buf->dtb_tomax != NULL) {
12137                         ASSERT(buf->dtb_size == size);
12138                         continue;
12139                 }
12140 
12141                 ASSERT(buf->dtb_xamot == NULL);
12142 
12143                 if ((buf->dtb_tomax = kmem_zalloc(size,
12144                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12145                         goto err;
12146 
12147                 buf->dtb_size = size;
12148                 buf->dtb_flags = flags;
12149                 buf->dtb_offset = 0;
12150                 buf->dtb_drops = 0;
12151 
12152                 if (flags & DTRACEBUF_NOSWITCH)
12153                         continue;
12154 
12155                 if ((buf->dtb_xamot = kmem_zalloc(size,
12156                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12157                         goto err;
12158         } while ((cp = cp->cpu_next) != cpu_list);
12159 
12160         return (0);
12161 
12162 err:
12163         cp = cpu_list;
12164 
12165         do {
12166                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12167                         continue;
12168 
12169                 buf = &bufs[cp->cpu_id];
12170                 desired += 2;
12171 
12172                 if (buf->dtb_xamot != NULL) {
12173                         ASSERT(buf->dtb_tomax != NULL);
12174                         ASSERT(buf->dtb_size == size);
12175                         kmem_free(buf->dtb_xamot, size);
12176                         allocated++;
12177                 }
12178 
12179                 if (buf->dtb_tomax != NULL) {
12180                         ASSERT(buf->dtb_size == size);
12181                         kmem_free(buf->dtb_tomax, size);
12182                         allocated++;
12183                 }
12184 
12185                 buf->dtb_tomax = NULL;
12186                 buf->dtb_xamot = NULL;
12187                 buf->dtb_size = 0;
12188         } while ((cp = cp->cpu_next) != cpu_list);
12189 #else
12190         int i;
12191 
12192         *factor = 1;
12193 #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
12194     defined(__mips__) || defined(__powerpc__) || defined(__riscv)
12195         /*
12196          * FreeBSD isn't good at limiting the amount of memory we
12197          * ask to malloc, so let's place a limit here before trying
12198          * to do something that might well end in tears at bedtime.
12199          */
12200         int bufsize_percpu_frac = dtrace_bufsize_max_frac * mp_ncpus;
12201         if (size > physmem * PAGE_SIZE / bufsize_percpu_frac)
12202                 return (ENOMEM);
12203 #endif
12204 
12205         ASSERT(MUTEX_HELD(&dtrace_lock));
12206         CPU_FOREACH(i) {
12207                 if (cpu != DTRACE_CPUALL && cpu != i)
12208                         continue;
12209 
12210                 buf = &bufs[i];
12211 
12212                 /*
12213                  * If there is already a buffer allocated for this CPU, it
12214                  * is only possible that this is a DR event.  In this case,
12215                  * the buffer size must match our specified size.
12216                  */
12217                 if (buf->dtb_tomax != NULL) {
12218                         ASSERT(buf->dtb_size == size);
12219                         continue;
12220                 }
12221 
12222                 ASSERT(buf->dtb_xamot == NULL);
12223 
12224                 if ((buf->dtb_tomax = kmem_zalloc(size,
12225                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12226                         goto err;
12227 
12228                 buf->dtb_size = size;
12229                 buf->dtb_flags = flags;
12230                 buf->dtb_offset = 0;
12231                 buf->dtb_drops = 0;
12232 
12233                 if (flags & DTRACEBUF_NOSWITCH)
12234                         continue;
12235 
12236                 if ((buf->dtb_xamot = kmem_zalloc(size,
12237                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
12238                         goto err;
12239         }
12240 
12241         return (0);
12242 
12243 err:
12244         /*
12245          * Error allocating memory, so free the buffers that were
12246          * allocated before the failed allocation.
12247          */
12248         CPU_FOREACH(i) {
12249                 if (cpu != DTRACE_CPUALL && cpu != i)
12250                         continue;
12251 
12252                 buf = &bufs[i];
12253                 desired += 2;
12254 
12255                 if (buf->dtb_xamot != NULL) {
12256                         ASSERT(buf->dtb_tomax != NULL);
12257                         ASSERT(buf->dtb_size == size);
12258                         kmem_free(buf->dtb_xamot, size);
12259                         allocated++;
12260                 }
12261 
12262                 if (buf->dtb_tomax != NULL) {
12263                         ASSERT(buf->dtb_size == size);
12264                         kmem_free(buf->dtb_tomax, size);
12265                         allocated++;
12266                 }
12267 
12268                 buf->dtb_tomax = NULL;
12269                 buf->dtb_xamot = NULL;
12270                 buf->dtb_size = 0;
12271 
12272         }
12273 #endif
12274         *factor = desired / (allocated > 0 ? allocated : 1);
12275 
12276         return (ENOMEM);
12277 }
12278 
12279 /*
12280  * Note:  called from probe context.  This function just increments the drop
12281  * count on a buffer.  It has been made a function to allow for the
12282  * possibility of understanding the source of mysterious drop counts.  (A
12283  * problem for which one may be particularly disappointed that DTrace cannot
12284  * be used to understand DTrace.)
12285  */
12286 static void
12287 dtrace_buffer_drop(dtrace_buffer_t *buf)
12288 {
12289         buf->dtb_drops++;
12290 }
12291 
12292 /*
12293  * Note:  called from probe context.  This function is called to reserve space
12294  * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
12295  * mstate.  Returns the new offset in the buffer, or a negative value if an
12296  * error has occurred.
12297  */
12298 static intptr_t
12299 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
12300     dtrace_state_t *state, dtrace_mstate_t *mstate)
12301 {
12302         intptr_t offs = buf->dtb_offset, soffs;
12303         intptr_t woffs;
12304         caddr_t tomax;
12305         size_t total;
12306 
12307         if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12308                 return (-1);
12309 
12310         if ((tomax = buf->dtb_tomax) == NULL) {
12311                 dtrace_buffer_drop(buf);
12312                 return (-1);
12313         }
12314 
12315         if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12316                 while (offs & (align - 1)) {
12317                         /*
12318                          * Assert that our alignment is off by a number which
12319                          * is itself sizeof (uint32_t) aligned.
12320                          */
12321                         ASSERT(!((align - (offs & (align - 1))) &
12322                             (sizeof (uint32_t) - 1)));
12323                         DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12324                         offs += sizeof (uint32_t);
12325                 }
12326 
12327                 if ((soffs = offs + needed) > buf->dtb_size) {
12328                         dtrace_buffer_drop(buf);
12329                         return (-1);
12330                 }
12331 
12332                 if (mstate == NULL)
12333                         return (offs);
12334 
12335                 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12336                 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12337                 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12338 
12339                 return (offs);
12340         }
12341 
12342         if (buf->dtb_flags & DTRACEBUF_FILL) {
12343                 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12344                     (buf->dtb_flags & DTRACEBUF_FULL))
12345                         return (-1);
12346                 goto out;
12347         }
12348 
12349         total = needed + (offs & (align - 1));
12350 
12351         /*
12352          * For a ring buffer, life is quite a bit more complicated.  Before
12353          * we can store any padding, we need to adjust our wrapping offset.
12354          * (If we've never before wrapped or we're not about to, no adjustment
12355          * is required.)
12356          */
12357         if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12358             offs + total > buf->dtb_size) {
12359                 woffs = buf->dtb_xamot_offset;
12360 
12361                 if (offs + total > buf->dtb_size) {
12362                         /*
12363                          * We can't fit in the end of the buffer.  First, a
12364                          * sanity check that we can fit in the buffer at all.
12365                          */
12366                         if (total > buf->dtb_size) {
12367                                 dtrace_buffer_drop(buf);
12368                                 return (-1);
12369                         }
12370 
12371                         /*
12372                          * We're going to be storing at the top of the buffer,
12373                          * so now we need to deal with the wrapped offset.  We
12374                          * only reset our wrapped offset to 0 if it is
12375                          * currently greater than the current offset.  If it
12376                          * is less than the current offset, it is because a
12377                          * previous allocation induced a wrap -- but the
12378                          * allocation didn't subsequently take the space due
12379                          * to an error or false predicate evaluation.  In this
12380                          * case, we'll just leave the wrapped offset alone: if
12381                          * the wrapped offset hasn't been advanced far enough
12382                          * for this allocation, it will be adjusted in the
12383                          * lower loop.
12384                          */
12385                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12386                                 if (woffs >= offs)
12387                                         woffs = 0;
12388                         } else {
12389                                 woffs = 0;
12390                         }
12391 
12392                         /*
12393                          * Now we know that we're going to be storing to the
12394                          * top of the buffer and that there is room for us
12395                          * there.  We need to clear the buffer from the current
12396                          * offset to the end (there may be old gunk there).
12397                          */
12398                         while (offs < buf->dtb_size)
12399                                 tomax[offs++] = 0;
12400 
12401                         /*
12402                          * We need to set our offset to zero.  And because we
12403                          * are wrapping, we need to set the bit indicating as
12404                          * much.  We can also adjust our needed space back
12405                          * down to the space required by the ECB -- we know
12406                          * that the top of the buffer is aligned.
12407                          */
12408                         offs = 0;
12409                         total = needed;
12410                         buf->dtb_flags |= DTRACEBUF_WRAPPED;
12411                 } else {
12412                         /*
12413                          * There is room for us in the buffer, so we simply
12414                          * need to check the wrapped offset.
12415                          */
12416                         if (woffs < offs) {
12417                                 /*
12418                                  * The wrapped offset is less than the offset.
12419                                  * This can happen if we allocated buffer space
12420                                  * that induced a wrap, but then we didn't
12421                                  * subsequently take the space due to an error
12422                                  * or false predicate evaluation.  This is
12423                                  * okay; we know that _this_ allocation isn't
12424                                  * going to induce a wrap.  We still can't
12425                                  * reset the wrapped offset to be zero,
12426                                  * however: the space may have been trashed in
12427                                  * the previous failed probe attempt.  But at
12428                                  * least the wrapped offset doesn't need to
12429                                  * be adjusted at all...
12430                                  */
12431                                 goto out;
12432                         }
12433                 }
12434 
12435                 while (offs + total > woffs) {
12436                         dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12437                         size_t size;
12438 
12439                         if (epid == DTRACE_EPIDNONE) {
12440                                 size = sizeof (uint32_t);
12441                         } else {
12442                                 ASSERT3U(epid, <=, state->dts_necbs);
12443                                 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12444 
12445                                 size = state->dts_ecbs[epid - 1]->dte_size;
12446                         }
12447 
12448                         ASSERT(woffs + size <= buf->dtb_size);
12449                         ASSERT(size != 0);
12450 
12451                         if (woffs + size == buf->dtb_size) {
12452                                 /*
12453                                  * We've reached the end of the buffer; we want
12454                                  * to set the wrapped offset to 0 and break
12455                                  * out.  However, if the offs is 0, then we're
12456                                  * in a strange edge-condition:  the amount of
12457                                  * space that we want to reserve plus the size
12458                                  * of the record that we're overwriting is
12459                                  * greater than the size of the buffer.  This
12460                                  * is problematic because if we reserve the
12461                                  * space but subsequently don't consume it (due
12462                                  * to a failed predicate or error) the wrapped
12463                                  * offset will be 0 -- yet the EPID at offset 0
12464                                  * will not be committed.  This situation is
12465                                  * relatively easy to deal with:  if we're in
12466                                  * this case, the buffer is indistinguishable
12467                                  * from one that hasn't wrapped; we need only
12468                                  * finish the job by clearing the wrapped bit,
12469                                  * explicitly setting the offset to be 0, and
12470                                  * zero'ing out the old data in the buffer.
12471                                  */
12472                                 if (offs == 0) {
12473                                         buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12474                                         buf->dtb_offset = 0;
12475                                         woffs = total;
12476 
12477                                         while (woffs < buf->dtb_size)
12478                                                 tomax[woffs++] = 0;
12479                                 }
12480 
12481                                 woffs = 0;
12482                                 break;
12483                         }
12484 
12485                         woffs += size;
12486                 }
12487 
12488                 /*
12489                  * We have a wrapped offset.  It may be that the wrapped offset
12490                  * has become zero -- that's okay.
12491                  */
12492                 buf->dtb_xamot_offset = woffs;
12493         }
12494 
12495 out:
12496         /*
12497          * Now we can plow the buffer with any necessary padding.
12498          */
12499         while (offs & (align - 1)) {
12500                 /*
12501                  * Assert that our alignment is off by a number which
12502                  * is itself sizeof (uint32_t) aligned.
12503                  */
12504                 ASSERT(!((align - (offs & (align - 1))) &
12505                     (sizeof (uint32_t) - 1)));
12506                 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12507                 offs += sizeof (uint32_t);
12508         }
12509 
12510         if (buf->dtb_flags & DTRACEBUF_FILL) {
12511                 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12512                         buf->dtb_flags |= DTRACEBUF_FULL;
12513                         return (-1);
12514                 }
12515         }
12516 
12517         if (mstate == NULL)
12518                 return (offs);
12519 
12520         /*
12521          * For ring buffers and fill buffers, the scratch space is always
12522          * the inactive buffer.
12523          */
12524         mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12525         mstate->dtms_scratch_size = buf->dtb_size;
12526         mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12527 
12528         return (offs);
12529 }
12530 
12531 static void
12532 dtrace_buffer_polish(dtrace_buffer_t *buf)
12533 {
12534         ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12535         ASSERT(MUTEX_HELD(&dtrace_lock));
12536 
12537         if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12538                 return;
12539 
12540         /*
12541          * We need to polish the ring buffer.  There are three cases:
12542          *
12543          * - The first (and presumably most common) is that there is no gap
12544          *   between the buffer offset and the wrapped offset.  In this case,
12545          *   there is nothing in the buffer that isn't valid data; we can
12546          *   mark the buffer as polished and return.
12547          *
12548          * - The second (less common than the first but still more common
12549          *   than the third) is that there is a gap between the buffer offset
12550          *   and the wrapped offset, and the wrapped offset is larger than the
12551          *   buffer offset.  This can happen because of an alignment issue, or
12552          *   can happen because of a call to dtrace_buffer_reserve() that
12553          *   didn't subsequently consume the buffer space.  In this case,
12554          *   we need to zero the data from the buffer offset to the wrapped
12555          *   offset.
12556          *
12557          * - The third (and least common) is that there is a gap between the
12558          *   buffer offset and the wrapped offset, but the wrapped offset is
12559          *   _less_ than the buffer offset.  This can only happen because a
12560          *   call to dtrace_buffer_reserve() induced a wrap, but the space
12561          *   was not subsequently consumed.  In this case, we need to zero the
12562          *   space from the offset to the end of the buffer _and_ from the
12563          *   top of the buffer to the wrapped offset.
12564          */
12565         if (buf->dtb_offset < buf->dtb_xamot_offset) {
12566                 bzero(buf->dtb_tomax + buf->dtb_offset,
12567                     buf->dtb_xamot_offset - buf->dtb_offset);
12568         }
12569 
12570         if (buf->dtb_offset > buf->dtb_xamot_offset) {
12571                 bzero(buf->dtb_tomax + buf->dtb_offset,
12572                     buf->dtb_size - buf->dtb_offset);
12573                 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12574         }
12575 }
12576 
12577 /*
12578  * This routine determines if data generated at the specified time has likely
12579  * been entirely consumed at user-level.  This routine is called to determine
12580  * if an ECB on a defunct probe (but for an active enabling) can be safely
12581  * disabled and destroyed.
12582  */
12583 static int
12584 dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
12585 {
12586         int i;
12587 
12588         for (i = 0; i < NCPU; i++) {
12589                 dtrace_buffer_t *buf = &bufs[i];
12590 
12591                 if (buf->dtb_size == 0)
12592                         continue;
12593 
12594                 if (buf->dtb_flags & DTRACEBUF_RING)
12595                         return (0);
12596 
12597                 if (!buf->dtb_switched && buf->dtb_offset != 0)
12598                         return (0);
12599 
12600                 if (buf->dtb_switched - buf->dtb_interval < when)
12601                         return (0);
12602         }
12603 
12604         return (1);
12605 }
12606 
12607 static void
12608 dtrace_buffer_free(dtrace_buffer_t *bufs)
12609 {
12610         int i;
12611 
12612         for (i = 0; i < NCPU; i++) {
12613                 dtrace_buffer_t *buf = &bufs[i];
12614 
12615                 if (buf->dtb_tomax == NULL) {
12616                         ASSERT(buf->dtb_xamot == NULL);
12617                         ASSERT(buf->dtb_size == 0);
12618                         continue;
12619                 }
12620 
12621                 if (buf->dtb_xamot != NULL) {
12622                         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12623                         kmem_free(buf->dtb_xamot, buf->dtb_size);
12624                 }
12625 
12626                 kmem_free(buf->dtb_tomax, buf->dtb_size);
12627                 buf->dtb_size = 0;
12628                 buf->dtb_tomax = NULL;
12629                 buf->dtb_xamot = NULL;
12630         }
12631 }
12632 
12633 /*
12634  * DTrace Enabling Functions
12635  */
12636 static dtrace_enabling_t *
12637 dtrace_enabling_create(dtrace_vstate_t *vstate)
12638 {
12639         dtrace_enabling_t *enab;
12640 
12641         enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12642         enab->dten_vstate = vstate;
12643 
12644         return (enab);
12645 }
12646 
12647 static void
12648 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12649 {
12650         dtrace_ecbdesc_t **ndesc;
12651         size_t osize, nsize;
12652 
12653         /*
12654          * We can't add to enablings after we've enabled them, or after we've
12655          * retained them.
12656          */
12657         ASSERT(enab->dten_probegen == 0);
12658         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12659 
12660         if (enab->dten_ndesc < enab->dten_maxdesc) {
12661                 enab->dten_desc[enab->dten_ndesc++] = ecb;
12662                 return;
12663         }
12664 
12665         osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12666 
12667         if (enab->dten_maxdesc == 0) {
12668                 enab->dten_maxdesc = 1;
12669         } else {
12670                 enab->dten_maxdesc <<= 1;
12671         }
12672 
12673         ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12674 
12675         nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12676         ndesc = kmem_zalloc(nsize, KM_SLEEP);
12677         bcopy(enab->dten_desc, ndesc, osize);
12678         if (enab->dten_desc != NULL)
12679                 kmem_free(enab->dten_desc, osize);
12680 
12681         enab->dten_desc = ndesc;
12682         enab->dten_desc[enab->dten_ndesc++] = ecb;
12683 }
12684 
12685 static void
12686 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12687     dtrace_probedesc_t *pd)
12688 {
12689         dtrace_ecbdesc_t *new;
12690         dtrace_predicate_t *pred;
12691         dtrace_actdesc_t *act;
12692 
12693         /*
12694          * We're going to create a new ECB description that matches the
12695          * specified ECB in every way, but has the specified probe description.
12696          */
12697         new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12698 
12699         if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12700                 dtrace_predicate_hold(pred);
12701 
12702         for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12703                 dtrace_actdesc_hold(act);
12704 
12705         new->dted_action = ecb->dted_action;
12706         new->dted_pred = ecb->dted_pred;
12707         new->dted_probe = *pd;
12708         new->dted_uarg = ecb->dted_uarg;
12709 
12710         dtrace_enabling_add(enab, new);
12711 }
12712 
12713 static void
12714 dtrace_enabling_dump(dtrace_enabling_t *enab)
12715 {
12716         int i;
12717 
12718         for (i = 0; i < enab->dten_ndesc; i++) {
12719                 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12720 
12721 #ifdef __FreeBSD__
12722                 printf("dtrace: enabling probe %d (%s:%s:%s:%s)\n", i,
12723                     desc->dtpd_provider, desc->dtpd_mod,
12724                     desc->dtpd_func, desc->dtpd_name);
12725 #else
12726                 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12727                     desc->dtpd_provider, desc->dtpd_mod,
12728                     desc->dtpd_func, desc->dtpd_name);
12729 #endif
12730         }
12731 }
12732 
12733 static void
12734 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12735 {
12736         int i;
12737         dtrace_ecbdesc_t *ep;
12738         dtrace_vstate_t *vstate = enab->dten_vstate;
12739 
12740         ASSERT(MUTEX_HELD(&dtrace_lock));
12741 
12742         for (i = 0; i < enab->dten_ndesc; i++) {
12743                 dtrace_actdesc_t *act, *next;
12744                 dtrace_predicate_t *pred;
12745 
12746                 ep = enab->dten_desc[i];
12747 
12748                 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12749                         dtrace_predicate_release(pred, vstate);
12750 
12751                 for (act = ep->dted_action; act != NULL; act = next) {
12752                         next = act->dtad_next;
12753                         dtrace_actdesc_release(act, vstate);
12754                 }
12755 
12756                 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12757         }
12758 
12759         if (enab->dten_desc != NULL)
12760                 kmem_free(enab->dten_desc,
12761                     enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
12762 
12763         /*
12764          * If this was a retained enabling, decrement the dts_nretained count
12765          * and take it off of the dtrace_retained list.
12766          */
12767         if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12768             dtrace_retained == enab) {
12769                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12770                 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12771                 enab->dten_vstate->dtvs_state->dts_nretained--;
12772                 dtrace_retained_gen++;
12773         }
12774 
12775         if (enab->dten_prev == NULL) {
12776                 if (dtrace_retained == enab) {
12777                         dtrace_retained = enab->dten_next;
12778 
12779                         if (dtrace_retained != NULL)
12780                                 dtrace_retained->dten_prev = NULL;
12781                 }
12782         } else {
12783                 ASSERT(enab != dtrace_retained);
12784                 ASSERT(dtrace_retained != NULL);
12785                 enab->dten_prev->dten_next = enab->dten_next;
12786         }
12787 
12788         if (enab->dten_next != NULL) {
12789                 ASSERT(dtrace_retained != NULL);
12790                 enab->dten_next->dten_prev = enab->dten_prev;
12791         }
12792 
12793         kmem_free(enab, sizeof (dtrace_enabling_t));
12794 }
12795 
12796 static int
12797 dtrace_enabling_retain(dtrace_enabling_t *enab)
12798 {
12799         dtrace_state_t *state;
12800 
12801         ASSERT(MUTEX_HELD(&dtrace_lock));
12802         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12803         ASSERT(enab->dten_vstate != NULL);
12804 
12805         state = enab->dten_vstate->dtvs_state;
12806         ASSERT(state != NULL);
12807 
12808         /*
12809          * We only allow each state to retain dtrace_retain_max enablings.
12810          */
12811         if (state->dts_nretained >= dtrace_retain_max)
12812                 return (ENOSPC);
12813 
12814         state->dts_nretained++;
12815         dtrace_retained_gen++;
12816 
12817         if (dtrace_retained == NULL) {
12818                 dtrace_retained = enab;
12819                 return (0);
12820         }
12821 
12822         enab->dten_next = dtrace_retained;
12823         dtrace_retained->dten_prev = enab;
12824         dtrace_retained = enab;
12825 
12826         return (0);
12827 }
12828 
12829 static int
12830 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12831     dtrace_probedesc_t *create)
12832 {
12833         dtrace_enabling_t *new, *enab;
12834         int found = 0, err = ENOENT;
12835 
12836         ASSERT(MUTEX_HELD(&dtrace_lock));
12837         ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12838         ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12839         ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12840         ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12841 
12842         new = dtrace_enabling_create(&state->dts_vstate);
12843 
12844         /*
12845          * Iterate over all retained enablings, looking for enablings that
12846          * match the specified state.
12847          */
12848         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12849                 int i;
12850 
12851                 /*
12852                  * dtvs_state can only be NULL for helper enablings -- and
12853                  * helper enablings can't be retained.
12854                  */
12855                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12856 
12857                 if (enab->dten_vstate->dtvs_state != state)
12858                         continue;
12859 
12860                 /*
12861                  * Now iterate over each probe description; we're looking for
12862                  * an exact match to the specified probe description.
12863                  */
12864                 for (i = 0; i < enab->dten_ndesc; i++) {
12865                         dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12866                         dtrace_probedesc_t *pd = &ep->dted_probe;
12867 
12868                         if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12869                                 continue;
12870 
12871                         if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12872                                 continue;
12873 
12874                         if (strcmp(pd->dtpd_func, match->dtpd_func))
12875                                 continue;
12876 
12877                         if (strcmp(pd->dtpd_name, match->dtpd_name))
12878                                 continue;
12879 
12880                         /*
12881                          * We have a winning probe!  Add it to our growing
12882                          * enabling.
12883                          */
12884                         found = 1;
12885                         dtrace_enabling_addlike(new, ep, create);
12886                 }
12887         }
12888 
12889         if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12890                 dtrace_enabling_destroy(new);
12891                 return (err);
12892         }
12893 
12894         return (0);
12895 }
12896 
12897 static void
12898 dtrace_enabling_retract(dtrace_state_t *state)
12899 {
12900         dtrace_enabling_t *enab, *next;
12901 
12902         ASSERT(MUTEX_HELD(&dtrace_lock));
12903 
12904         /*
12905          * Iterate over all retained enablings, destroy the enablings retained
12906          * for the specified state.
12907          */
12908         for (enab = dtrace_retained; enab != NULL; enab = next) {
12909                 next = enab->dten_next;
12910 
12911                 /*
12912                  * dtvs_state can only be NULL for helper enablings -- and
12913                  * helper enablings can't be retained.
12914                  */
12915                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12916 
12917                 if (enab->dten_vstate->dtvs_state == state) {
12918                         ASSERT(state->dts_nretained > 0);
12919                         dtrace_enabling_destroy(enab);
12920                 }
12921         }
12922 
12923         ASSERT(state->dts_nretained == 0);
12924 }
12925 
12926 static int
12927 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12928 {
12929         int i = 0;
12930         int matched = 0;
12931 
12932         ASSERT(MUTEX_HELD(&cpu_lock));
12933         ASSERT(MUTEX_HELD(&dtrace_lock));
12934 
12935         for (i = 0; i < enab->dten_ndesc; i++) {
12936                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12937 
12938                 enab->dten_current = ep;
12939                 enab->dten_error = 0;
12940 
12941                 matched += dtrace_probe_enable(&ep->dted_probe, enab);
12942 
12943                 if (enab->dten_error != 0) {
12944                         /*
12945                          * If we get an error half-way through enabling the
12946                          * probes, we kick out -- perhaps with some number of
12947                          * them enabled.  Leaving enabled probes enabled may
12948                          * be slightly confusing for user-level, but we expect
12949                          * that no one will attempt to actually drive on in
12950                          * the face of such errors.  If this is an anonymous
12951                          * enabling (indicated with a NULL nmatched pointer),
12952                          * we cmn_err() a message.  We aren't expecting to
12953                          * get such an error -- such as it can exist at all,
12954                          * it would be a result of corrupted DOF in the driver
12955                          * properties.
12956                          */
12957                         if (nmatched == NULL) {
12958                                 cmn_err(CE_WARN, "dtrace_enabling_match() "
12959                                     "error on %p: %d", (void *)ep,
12960                                     enab->dten_error);
12961                         }
12962 
12963                         return (enab->dten_error);
12964                 }
12965         }
12966 
12967         enab->dten_probegen = dtrace_probegen;
12968         if (nmatched != NULL)
12969                 *nmatched = matched;
12970 
12971         return (0);
12972 }
12973 
12974 static void
12975 dtrace_enabling_matchall(void)
12976 {
12977         dtrace_enabling_t *enab;
12978 
12979         mutex_enter(&cpu_lock);
12980         mutex_enter(&dtrace_lock);
12981 
12982         /*
12983          * Iterate over all retained enablings to see if any probes match
12984          * against them.  We only perform this operation on enablings for which
12985          * we have sufficient permissions by virtue of being in the global zone
12986          * or in the same zone as the DTrace client.  Because we can be called
12987          * after dtrace_detach() has been called, we cannot assert that there
12988          * are retained enablings.  We can safely load from dtrace_retained,
12989          * however:  the taskq_destroy() at the end of dtrace_detach() will
12990          * block pending our completion.
12991          */
12992         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12993 #ifdef illumos
12994                 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
12995 
12996                 if (INGLOBALZONE(curproc) ||
12997                     cr != NULL && getzoneid() == crgetzoneid(cr))
12998 #endif
12999                         (void) dtrace_enabling_match(enab, NULL);
13000         }
13001 
13002         mutex_exit(&dtrace_lock);
13003         mutex_exit(&cpu_lock);
13004 }
13005 
13006 /*
13007  * If an enabling is to be enabled without having matched probes (that is, if
13008  * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
13009  * enabling must be _primed_ by creating an ECB for every ECB description.
13010  * This must be done to assure that we know the number of speculations, the
13011  * number of aggregations, the minimum buffer size needed, etc. before we
13012  * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
13013  * enabling any probes, we create ECBs for every ECB description, but with a
13014  * NULL probe -- which is exactly what this function does.
13015  */
13016 static void
13017 dtrace_enabling_prime(dtrace_state_t *state)
13018 {
13019         dtrace_enabling_t *enab;
13020         int i;
13021 
13022         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
13023                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
13024 
13025                 if (enab->dten_vstate->dtvs_state != state)
13026                         continue;
13027 
13028                 /*
13029                  * We don't want to prime an enabling more than once, lest
13030                  * we allow a malicious user to induce resource exhaustion.
13031                  * (The ECBs that result from priming an enabling aren't
13032                  * leaked -- but they also aren't deallocated until the
13033                  * consumer state is destroyed.)
13034                  */
13035                 if (enab->dten_primed)
13036                         continue;
13037 
13038                 for (i = 0; i < enab->dten_ndesc; i++) {
13039                         enab->dten_current = enab->dten_desc[i];
13040                         (void) dtrace_probe_enable(NULL, enab);
13041                 }
13042 
13043                 enab->dten_primed = 1;
13044         }
13045 }
13046 
13047 /*
13048  * Called to indicate that probes should be provided due to retained
13049  * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
13050  * must take an initial lap through the enabling calling the dtps_provide()
13051  * entry point explicitly to allow for autocreated probes.
13052  */
13053 static void
13054 dtrace_enabling_provide(dtrace_provider_t *prv)
13055 {
13056         int i, all = 0;
13057         dtrace_probedesc_t desc;
13058         dtrace_genid_t gen;
13059 
13060         ASSERT(MUTEX_HELD(&dtrace_lock));
13061         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
13062 
13063         if (prv == NULL) {
13064                 all = 1;
13065                 prv = dtrace_provider;
13066         }
13067 
13068         do {
13069                 dtrace_enabling_t *enab;
13070                 void *parg = prv->dtpv_arg;
13071 
13072 retry:
13073                 gen = dtrace_retained_gen;
13074                 for (enab = dtrace_retained; enab != NULL;
13075                     enab = enab->dten_next) {
13076                         for (i = 0; i < enab->dten_ndesc; i++) {
13077                                 desc = enab->dten_desc[i]->dted_probe;
13078                                 mutex_exit(&dtrace_lock);
13079                                 prv->dtpv_pops.dtps_provide(parg, &desc);
13080                                 mutex_enter(&dtrace_lock);
13081                                 /*
13082                                  * Process the retained enablings again if
13083                                  * they have changed while we weren't holding
13084                                  * dtrace_lock.
13085                                  */
13086                                 if (gen != dtrace_retained_gen)
13087                                         goto retry;
13088                         }
13089                 }
13090         } while (all && (prv = prv->dtpv_next) != NULL);
13091 
13092         mutex_exit(&dtrace_lock);
13093         dtrace_probe_provide(NULL, all ? NULL : prv);
13094         mutex_enter(&dtrace_lock);
13095 }
13096 
13097 /*
13098  * Called to reap ECBs that are attached to probes from defunct providers.
13099  */
13100 static void
13101 dtrace_enabling_reap(void)
13102 {
13103         dtrace_provider_t *prov;
13104         dtrace_probe_t *probe;
13105         dtrace_ecb_t *ecb;
13106         hrtime_t when;
13107         int i;
13108 
13109         mutex_enter(&cpu_lock);
13110         mutex_enter(&dtrace_lock);
13111 
13112         for (i = 0; i < dtrace_nprobes; i++) {
13113                 if ((probe = dtrace_probes[i]) == NULL)
13114                         continue;
13115 
13116                 if (probe->dtpr_ecb == NULL)
13117                         continue;
13118 
13119                 prov = probe->dtpr_provider;
13120 
13121                 if ((when = prov->dtpv_defunct) == 0)
13122                         continue;
13123 
13124                 /*
13125                  * We have ECBs on a defunct provider:  we want to reap these
13126                  * ECBs to allow the provider to unregister.  The destruction
13127                  * of these ECBs must be done carefully:  if we destroy the ECB
13128                  * and the consumer later wishes to consume an EPID that
13129                  * corresponds to the destroyed ECB (and if the EPID metadata
13130                  * has not been previously consumed), the consumer will abort
13131                  * processing on the unknown EPID.  To reduce (but not, sadly,
13132                  * eliminate) the possibility of this, we will only destroy an
13133                  * ECB for a defunct provider if, for the state that
13134                  * corresponds to the ECB:
13135                  *
13136                  *  (a) There is no speculative tracing (which can effectively
13137                  *      cache an EPID for an arbitrary amount of time).
13138                  *
13139                  *  (b) The principal buffers have been switched twice since the
13140                  *      provider became defunct.
13141                  *
13142                  *  (c) The aggregation buffers are of zero size or have been
13143                  *      switched twice since the provider became defunct.
13144                  *
13145                  * We use dts_speculates to determine (a) and call a function
13146                  * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
13147                  * that as soon as we've been unable to destroy one of the ECBs
13148                  * associated with the probe, we quit trying -- reaping is only
13149                  * fruitful in as much as we can destroy all ECBs associated
13150                  * with the defunct provider's probes.
13151                  */
13152                 while ((ecb = probe->dtpr_ecb) != NULL) {
13153                         dtrace_state_t *state = ecb->dte_state;
13154                         dtrace_buffer_t *buf = state->dts_buffer;
13155                         dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
13156 
13157                         if (state->dts_speculates)
13158                                 break;
13159 
13160                         if (!dtrace_buffer_consumed(buf, when))
13161                                 break;
13162 
13163                         if (!dtrace_buffer_consumed(aggbuf, when))
13164                                 break;
13165 
13166                         dtrace_ecb_disable(ecb);
13167                         ASSERT(probe->dtpr_ecb != ecb);
13168                         dtrace_ecb_destroy(ecb);
13169                 }
13170         }
13171 
13172         mutex_exit(&dtrace_lock);
13173         mutex_exit(&cpu_lock);
13174 }
13175 
13176 /*
13177  * DTrace DOF Functions
13178  */
13179 /*ARGSUSED*/
13180 static void
13181 dtrace_dof_error(dof_hdr_t *dof, const char *str)
13182 {
13183         if (dtrace_err_verbose)
13184                 cmn_err(CE_WARN, "failed to process DOF: %s", str);
13185 
13186 #ifdef DTRACE_ERRDEBUG
13187         dtrace_errdebug(str);
13188 #endif
13189 }
13190 
13191 /*
13192  * Create DOF out of a currently enabled state.  Right now, we only create
13193  * DOF containing the run-time options -- but this could be expanded to create
13194  * complete DOF representing the enabled state.
13195  */
13196 static dof_hdr_t *
13197 dtrace_dof_create(dtrace_state_t *state)
13198 {
13199         dof_hdr_t *dof;
13200         dof_sec_t *sec;
13201         dof_optdesc_t *opt;
13202         int i, len = sizeof (dof_hdr_t) +
13203             roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
13204             sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
13205 
13206         ASSERT(MUTEX_HELD(&dtrace_lock));
13207 
13208         dof = kmem_zalloc(len, KM_SLEEP);
13209         dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
13210         dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
13211         dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
13212         dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
13213 
13214         dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
13215         dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
13216         dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
13217         dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
13218         dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
13219         dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
13220 
13221         dof->dofh_flags = 0;
13222         dof->dofh_hdrsize = sizeof (dof_hdr_t);
13223         dof->dofh_secsize = sizeof (dof_sec_t);
13224         dof->dofh_secnum = 1;   /* only DOF_SECT_OPTDESC */
13225         dof->dofh_secoff = sizeof (dof_hdr_t);
13226         dof->dofh_loadsz = len;
13227         dof->dofh_filesz = len;
13228         dof->dofh_pad = 0;
13229 
13230         /*
13231          * Fill in the option section header...
13232          */
13233         sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
13234         sec->dofs_type = DOF_SECT_OPTDESC;
13235         sec->dofs_align = sizeof (uint64_t);
13236         sec->dofs_flags = DOF_SECF_LOAD;
13237         sec->dofs_entsize = sizeof (dof_optdesc_t);
13238 
13239         opt = (dof_optdesc_t *)((uintptr_t)sec +
13240             roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
13241 
13242         sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
13243         sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
13244 
13245         for (i = 0; i < DTRACEOPT_MAX; i++) {
13246                 opt[i].dofo_option = i;
13247                 opt[i].dofo_strtab = DOF_SECIDX_NONE;
13248                 opt[i].dofo_value = state->dts_options[i];
13249         }
13250 
13251         return (dof);
13252 }
13253 
13254 static dof_hdr_t *
13255 dtrace_dof_copyin(uintptr_t uarg, int *errp)
13256 {
13257         dof_hdr_t hdr, *dof;
13258 
13259         ASSERT(!MUTEX_HELD(&dtrace_lock));
13260 
13261         /*
13262          * First, we're going to copyin() the sizeof (dof_hdr_t).
13263          */
13264         if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
13265                 dtrace_dof_error(NULL, "failed to copyin DOF header");
13266                 *errp = EFAULT;
13267                 return (NULL);
13268         }
13269 
13270         /*
13271          * Now we'll allocate the entire DOF and copy it in -- provided
13272          * that the length isn't outrageous.
13273          */
13274         if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
13275                 dtrace_dof_error(&hdr, "load size exceeds maximum");
13276                 *errp = E2BIG;
13277                 return (NULL);
13278         }
13279 
13280         if (hdr.dofh_loadsz < sizeof (hdr)) {
13281                 dtrace_dof_error(&hdr, "invalid load size");
13282                 *errp = EINVAL;
13283                 return (NULL);
13284         }
13285 
13286         dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
13287 
13288         if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
13289             dof->dofh_loadsz != hdr.dofh_loadsz) {
13290                 kmem_free(dof, hdr.dofh_loadsz);
13291                 *errp = EFAULT;
13292                 return (NULL);
13293         }
13294 
13295         return (dof);
13296 }
13297 
13298 #ifdef __FreeBSD__
13299 static dof_hdr_t *
13300 dtrace_dof_copyin_proc(struct proc *p, uintptr_t uarg, int *errp)
13301 {
13302         dof_hdr_t hdr, *dof;
13303         struct thread *td;
13304         size_t loadsz;
13305 
13306         ASSERT(!MUTEX_HELD(&dtrace_lock));
13307 
13308         td = curthread;
13309 
13310         /*
13311          * First, we're going to copyin() the sizeof (dof_hdr_t).
13312          */
13313         if (proc_readmem(td, p, uarg, &hdr, sizeof(hdr)) != sizeof(hdr)) {
13314                 dtrace_dof_error(NULL, "failed to copyin DOF header");
13315                 *errp = EFAULT;
13316                 return (NULL);
13317         }
13318 
13319         /*
13320          * Now we'll allocate the entire DOF and copy it in -- provided
13321          * that the length isn't outrageous.
13322          */
13323         if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
13324                 dtrace_dof_error(&hdr, "load size exceeds maximum");
13325                 *errp = E2BIG;
13326                 return (NULL);
13327         }
13328         loadsz = (size_t)hdr.dofh_loadsz;
13329 
13330         if (loadsz < sizeof (hdr)) {
13331                 dtrace_dof_error(&hdr, "invalid load size");
13332                 *errp = EINVAL;
13333                 return (NULL);
13334         }
13335 
13336         dof = kmem_alloc(loadsz, KM_SLEEP);
13337 
13338         if (proc_readmem(td, p, uarg, dof, loadsz) != loadsz ||
13339             dof->dofh_loadsz != loadsz) {
13340                 kmem_free(dof, hdr.dofh_loadsz);
13341                 *errp = EFAULT;
13342                 return (NULL);
13343         }
13344 
13345         return (dof);
13346 }
13347 
/*
 * Convert one ASCII hexadecimal digit to its numeric value.
 *
 * Returns 0-9 for '0'-'9' and 10-15 for 'A'-'F' or 'a'-'f'.  Any other
 * character yields UCHAR_MAX, which callers treat as "not a hex digit".
 */
static __inline uchar_t
dtrace_dof_char(char c)
{

        switch (c) {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
                return (c - '0');
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
                return (c - 'A' + 10);
        case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
                return (c - 'a' + 10);
        }
        /* Not a hexadecimal digit. */
        return (UCHAR_MAX);
}
13382 #endif /* __FreeBSD__ */
13383 
13384 static dof_hdr_t *
13385 dtrace_dof_property(const char *name)
13386 {
13387 #ifdef __FreeBSD__
13388         uint8_t *dofbuf;
13389         u_char *data, *eol;
13390         caddr_t doffile;
13391         size_t bytes, len, i;
13392         dof_hdr_t *dof;
13393         u_char c1, c2;
13394 
13395         dof = NULL;
13396 
13397         doffile = preload_search_by_type("dtrace_dof");
13398         if (doffile == NULL)
13399                 return (NULL);
13400 
13401         data = preload_fetch_addr(doffile);
13402         len = preload_fetch_size(doffile);
13403         for (;;) {
13404                 /* Look for the end of the line. All lines end in a newline. */
13405                 eol = memchr(data, '\n', len);
13406                 if (eol == NULL)
13407                         return (NULL);
13408 
13409                 if (strncmp(name, data, strlen(name)) == 0)
13410                         break;
13411 
13412                 eol++; /* skip past the newline */
13413                 len -= eol - data;
13414                 data = eol;
13415         }
13416 
13417         /* We've found the data corresponding to the specified key. */
13418 
13419         data += strlen(name) + 1; /* skip past the '=' */
13420         len = eol - data;
13421         if (len % 2 != 0) {
13422                 dtrace_dof_error(NULL, "invalid DOF encoding length");
13423                 goto doferr;
13424         }
13425         bytes = len / 2;
13426         if (bytes < sizeof(dof_hdr_t)) {
13427                 dtrace_dof_error(NULL, "truncated header");
13428                 goto doferr;
13429         }
13430 
13431         /*
13432          * Each byte is represented by the two ASCII characters in its hex
13433          * representation.
13434          */
13435         dofbuf = malloc(bytes, M_SOLARIS, M_WAITOK);
13436         for (i = 0; i < bytes; i++) {
13437                 c1 = dtrace_dof_char(data[i * 2]);
13438                 c2 = dtrace_dof_char(data[i * 2 + 1]);
13439                 if (c1 == UCHAR_MAX || c2 == UCHAR_MAX) {
13440                         dtrace_dof_error(NULL, "invalid hex char in DOF");
13441                         goto doferr;
13442                 }
13443                 dofbuf[i] = c1 * 16 + c2;
13444         }
13445 
13446         dof = (dof_hdr_t *)dofbuf;
13447         if (bytes < dof->dofh_loadsz) {
13448                 dtrace_dof_error(NULL, "truncated DOF");
13449                 goto doferr;
13450         }
13451 
13452         if (dof->dofh_loadsz >= dtrace_dof_maxsize) {
13453                 dtrace_dof_error(NULL, "oversized DOF");
13454                 goto doferr;
13455         }
13456 
13457         return (dof);
13458 
13459 doferr:
13460         free(dof, M_SOLARIS);
13461         return (NULL);
13462 #else /* __FreeBSD__ */
13463         uchar_t *buf;
13464         uint64_t loadsz;
13465         unsigned int len, i;
13466         dof_hdr_t *dof;
13467 
13468         /*
13469          * Unfortunately, array of values in .conf files are always (and
13470          * only) interpreted to be integer arrays.  We must read our DOF
13471          * as an integer array, and then squeeze it into a byte array.
13472          */
13473         if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
13474             (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
13475                 return (NULL);
13476 
13477         for (i = 0; i < len; i++)
13478                 buf[i] = (uchar_t)(((int *)buf)[i]);
13479 
13480         if (len < sizeof (dof_hdr_t)) {
13481                 ddi_prop_free(buf);
13482                 dtrace_dof_error(NULL, "truncated header");
13483                 return (NULL);
13484         }
13485 
13486         if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
13487                 ddi_prop_free(buf);
13488                 dtrace_dof_error(NULL, "truncated DOF");
13489                 return (NULL);
13490         }
13491 
13492         if (loadsz >= dtrace_dof_maxsize) {
13493                 ddi_prop_free(buf);
13494                 dtrace_dof_error(NULL, "oversized DOF");
13495                 return (NULL);
13496         }
13497 
13498         dof = kmem_alloc(loadsz, KM_SLEEP);
13499         bcopy(buf, dof, loadsz);
13500         ddi_prop_free(buf);
13501 
13502         return (dof);
13503 #endif /* !__FreeBSD__ */
13504 }
13505 
13506 static void
13507 dtrace_dof_destroy(dof_hdr_t *dof)
13508 {
13509         kmem_free(dof, dof->dofh_loadsz);
13510 }
13511 
13512 /*
13513  * Return the dof_sec_t pointer corresponding to a given section index.  If the
13514  * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
13515  * a type other than DOF_SECT_NONE is specified, the header is checked against
13516  * this type and NULL is returned if the types do not match.
13517  */
13518 static dof_sec_t *
13519 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13520 {
13521         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13522             ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13523 
13524         if (i >= dof->dofh_secnum) {
13525                 dtrace_dof_error(dof, "referenced section index is invalid");
13526                 return (NULL);
13527         }
13528 
13529         if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13530                 dtrace_dof_error(dof, "referenced section is not loadable");
13531                 return (NULL);
13532         }
13533 
13534         if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13535                 dtrace_dof_error(dof, "referenced section is the wrong type");
13536                 return (NULL);
13537         }
13538 
13539         return (sec);
13540 }
13541 
13542 static dtrace_probedesc_t *
13543 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13544 {
13545         dof_probedesc_t *probe;
13546         dof_sec_t *strtab;
13547         uintptr_t daddr = (uintptr_t)dof;
13548         uintptr_t str;
13549         size_t size;
13550 
13551         if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13552                 dtrace_dof_error(dof, "invalid probe section");
13553                 return (NULL);
13554         }
13555 
13556         if (sec->dofs_align != sizeof (dof_secidx_t)) {
13557                 dtrace_dof_error(dof, "bad alignment in probe description");
13558                 return (NULL);
13559         }
13560 
13561         if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13562                 dtrace_dof_error(dof, "truncated probe description");
13563                 return (NULL);
13564         }
13565 
13566         probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13567         strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13568 
13569         if (strtab == NULL)
13570                 return (NULL);
13571 
13572         str = daddr + strtab->dofs_offset;
13573         size = strtab->dofs_size;
13574 
13575         if (probe->dofp_provider >= strtab->dofs_size) {
13576                 dtrace_dof_error(dof, "corrupt probe provider");
13577                 return (NULL);
13578         }
13579 
13580         (void) strncpy(desc->dtpd_provider,
13581             (char *)(str + probe->dofp_provider),
13582             MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13583 
13584         if (probe->dofp_mod >= strtab->dofs_size) {
13585                 dtrace_dof_error(dof, "corrupt probe module");
13586                 return (NULL);
13587         }
13588 
13589         (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13590             MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13591 
13592         if (probe->dofp_func >= strtab->dofs_size) {
13593                 dtrace_dof_error(dof, "corrupt probe function");
13594                 return (NULL);
13595         }
13596 
13597         (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13598             MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13599 
13600         if (probe->dofp_name >= strtab->dofs_size) {
13601                 dtrace_dof_error(dof, "corrupt probe name");
13602                 return (NULL);
13603         }
13604 
13605         (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13606             MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13607 
13608         return (desc);
13609 }
13610 
/*
 * Build a dtrace_difo_t from the DOF_SECT_DIFOHDR section 'sec'.
 *
 * Each section linked from the DIFO header whose type appears in the local
 * difo[] table (DIF text, integer table, string table, variable table) is
 * copied into a freshly allocated buffer attached to the new DIFO.  The
 * result is then passed to dtrace_difo_validate() and dtrace_difo_init().
 *
 * Returns the new DIFO on success.  On any failure, everything allocated here
 * is freed and NULL is returned.
 */
static dtrace_difo_t *
dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
        dtrace_difo_t *dp;
        size_t ttl = 0;
        dof_difohdr_t *dofd;
        uintptr_t daddr = (uintptr_t)dof;
        size_t max = dtrace_difo_maxsize;
        int i, l, n;

        /*
         * Table of recognized sub-section types.  For each one: the offsets
         * within dtrace_difo_t of the buffer pointer and length field to fill
         * in, the required entry size and alignment, and the error message to
         * report if a second section of that type is encountered.  The table
         * is terminated by a DOF_SECT_NONE entry.
         */
        static const struct {
                int section;
                int bufoffs;
                int lenoffs;
                int entsize;
                int align;
                const char *msg;
        } difo[] = {
                { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
                offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
                sizeof (dif_instr_t), "multiple DIF sections" },

                { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
                offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
                sizeof (uint64_t), "multiple integer tables" },

                { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
                offsetof(dtrace_difo_t, dtdo_strlen), 0,
                sizeof (char), "multiple string tables" },

                { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
                offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
                sizeof (uint_t), "multiple variable tables" },

                { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
        };

        if (sec->dofs_type != DOF_SECT_DIFOHDR) {
                dtrace_dof_error(dof, "invalid DIFO header section");
                return (NULL);
        }

        if (sec->dofs_align != sizeof (dof_secidx_t)) {
                dtrace_dof_error(dof, "bad alignment in DIFO header");
                return (NULL);
        }

        if (sec->dofs_size < sizeof (dof_difohdr_t) ||
            sec->dofs_size % sizeof (dof_secidx_t)) {
                dtrace_dof_error(dof, "bad size in DIFO header");
                return (NULL);
        }

        dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
        /*
         * Number of section links that follow the header.
         * NOTE(review): the "+ 1" presumably accounts for one dofd_links[]
         * entry already included in sizeof (dof_difohdr_t) -- confirm
         * against the structure definition.
         */
        n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;

        dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
        dp->dtdo_rtype = dofd->dofd_rtype;

        for (l = 0; l < n; l++) {
                dof_sec_t *subsec;
                void **bufp;
                uint32_t *lenp;

                if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
                    dofd->dofd_links[l])) == NULL)
                        goto err; /* invalid section link */

                /*
                 * The running total of all linked sub-section sizes may not
                 * exceed dtrace_difo_maxsize.
                 */
                if (ttl + subsec->dofs_size > max) {
                        dtrace_dof_error(dof, "exceeds maximum size");
                        goto err;
                }

                ttl += subsec->dofs_size;

                /*
                 * Find the table entry matching this sub-section's type; on
                 * a match, validate the sub-section and copy it in.
                 */
                for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
                        if (subsec->dofs_type != difo[i].section)
                                continue;

                        if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
                                dtrace_dof_error(dof, "section not loaded");
                                goto err;
                        }

                        if (subsec->dofs_align != difo[i].align) {
                                dtrace_dof_error(dof, "bad alignment");
                                goto err;
                        }

                        /* Locate the destination fields inside the DIFO. */
                        bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
                        lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);

                        /* A non-NULL buffer means this type was seen before. */
                        if (*bufp != NULL) {
                                dtrace_dof_error(dof, difo[i].msg);
                                goto err;
                        }

                        if (difo[i].entsize != subsec->dofs_entsize) {
                                dtrace_dof_error(dof, "entry size mismatch");
                                goto err;
                        }

                        if (subsec->dofs_entsize != 0 &&
                            (subsec->dofs_size % subsec->dofs_entsize) != 0) {
                                dtrace_dof_error(dof, "corrupt entry size");
                                goto err;
                        }

                        /*
                         * Copy the raw bytes, then convert the stored length
                         * from bytes to an entry count for tables that have
                         * a non-zero entry size.
                         */
                        *lenp = subsec->dofs_size;
                        *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
                        bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
                            *bufp, subsec->dofs_size);

                        if (subsec->dofs_entsize != 0)
                                *lenp /= subsec->dofs_entsize;

                        break;
                }

                /*
                 * If we encounter a loadable DIFO sub-section that is not
                 * known to us, assume this is a broken program and fail.
                 */
                if (difo[i].section == DOF_SECT_NONE &&
                    (subsec->dofs_flags & DOF_SECF_LOAD)) {
                        dtrace_dof_error(dof, "unrecognized DIFO subsection");
                        goto err;
                }
        }

        if (dp->dtdo_buf == NULL) {
                /*
                 * We can't have a DIF object without DIF text.
                 */
                dtrace_dof_error(dof, "missing DIF text");
                goto err;
        }

        /*
         * Before we validate the DIF object, run through the variable table
         * looking for the strings -- if any of their sizes are zero, we'll set
         * their size to be the system-wide default string size.  Note that
         * this should _not_ happen if the "strsize" option has been set --
         * in this case, the compiler should have set the size to reflect the
         * setting of the option.
         */
        for (i = 0; i < dp->dtdo_varlen; i++) {
                dtrace_difv_t *v = &dp->dtdo_vartab[i];
                dtrace_diftype_t *t = &v->dtdv_type;

                /* Variables with IDs below DIF_VAR_OTHER_UBASE are skipped. */
                if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
                        continue;

                if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
                        t->dtdt_size = dtrace_strsize_default;
        }

        if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
                goto err;

        dtrace_difo_init(dp, vstate);
        return (dp);

err:
        /*
         * Free whatever was copied in.  Buffers never allocated are still
         * NULL with a zero length, because dp came from kmem_zalloc().
         */
        kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
        kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
        kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
        kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));

        kmem_free(dp, sizeof (dtrace_difo_t));
        return (NULL);
}
13784 
13785 static dtrace_predicate_t *
13786 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13787     cred_t *cr)
13788 {
13789         dtrace_difo_t *dp;
13790 
13791         if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13792                 return (NULL);
13793 
13794         return (dtrace_predicate_create(dp));
13795 }
13796 
/*
 * Build a linked list of dtrace_actdesc_t from the DOF_SECT_ACTDESC section
 * 'sec', one element per dof_actdesc_t entry, in section order.  Actions that
 * take a string argument get a private copy of that string from the DOF
 * string table, and actions that reference a DIFO have it loaded via
 * dtrace_dof_difo().
 *
 * Returns the head of the list on success.  On failure, every action created
 * so far is released and NULL is returned.
 */
static dtrace_actdesc_t *
dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
        dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
        dof_actdesc_t *desc;
        dof_sec_t *difosec;
        size_t offs;
        uintptr_t daddr = (uintptr_t)dof;
        uint64_t arg;
        dtrace_actkind_t kind;

        if (sec->dofs_type != DOF_SECT_ACTDESC) {
                dtrace_dof_error(dof, "invalid action section");
                return (NULL);
        }

        if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
                dtrace_dof_error(dof, "truncated action description");
                return (NULL);
        }

        if (sec->dofs_align != sizeof (uint64_t)) {
                dtrace_dof_error(dof, "bad alignment in action description");
                return (NULL);
        }

        if (sec->dofs_size < sec->dofs_entsize) {
                dtrace_dof_error(dof, "section entry size exceeds total size");
                return (NULL);
        }

        /*
         * This also rules out a zero entry size, so the division and the
         * loop stride below are safe.
         */
        if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
                dtrace_dof_error(dof, "bad entry size in action description");
                return (NULL);
        }

        if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
                dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
                return (NULL);
        }

        for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
                desc = (dof_actdesc_t *)(daddr +
                    (uintptr_t)sec->dofs_offset + offs);
                kind = (dtrace_actkind_t)desc->dofa_kind;

                /*
                 * The string-argument path is taken for printf()-like actions
                 * (except a printa() with no string table) and for DIF
                 * expressions that name a string table.
                 */
                if ((DTRACEACT_ISPRINTFLIKE(kind) &&
                    (kind != DTRACEACT_PRINTA ||
                    desc->dofa_strtab != DOF_SECIDX_NONE)) ||
                    (kind == DTRACEACT_DIFEXPR &&
                    desc->dofa_strtab != DOF_SECIDX_NONE)) {
                        dof_sec_t *strtab;
                        char *str, *fmt;
                        uint64_t i;

                        /*
                         * The argument to these actions is an index into the
                         * DOF string table.  For printf()-like actions, this
                         * is the format string.  For print(), this is the
                         * CTF type of the expression result.
                         */
                        if ((strtab = dtrace_dof_sect(dof,
                            DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
                                goto err;

                        str = (char *)((uintptr_t)dof +
                            (uintptr_t)strtab->dofs_offset);

                        /*
                         * Scan for the terminating NUL without running past
                         * the end of the string table.
                         */
                        for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
                                if (str[i] == '\0')
                                        break;
                        }

                        /* No NUL found, or the start offset was out of range. */
                        if (i >= strtab->dofs_size) {
                                dtrace_dof_error(dof, "bogus format string");
                                goto err;
                        }

                        if (i == desc->dofa_arg) {
                                dtrace_dof_error(dof, "empty format string");
                                goto err;
                        }

                        /*
                         * i becomes the string length; copy the string and
                         * its NUL into a private buffer whose address is
                         * passed on as the action's argument.
                         */
                        i -= desc->dofa_arg;
                        fmt = kmem_alloc(i + 1, KM_SLEEP);
                        bcopy(&str[desc->dofa_arg], fmt, i + 1);
                        arg = (uint64_t)(uintptr_t)fmt;
                } else {
                        if (kind == DTRACEACT_PRINTA) {
                                ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
                                arg = 0;
                        } else {
                                arg = desc->dofa_arg;
                        }
                }

                act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
                    desc->dofa_uarg, arg);

                /*
                 * Append to the list before loading the DIFO so that the
                 * error path below releases this action as well.
                 */
                if (last != NULL) {
                        last->dtad_next = act;
                } else {
                        first = act;
                }

                last = act;

                if (desc->dofa_difo == DOF_SECIDX_NONE)
                        continue;

                if ((difosec = dtrace_dof_sect(dof,
                    DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
                        goto err;

                act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);

                if (act->dtad_difo == NULL)
                        goto err;
        }

        /*
         * The size checks above guarantee at least one entry was processed.
         */
        ASSERT(first != NULL);
        return (first);

err:
        /* Fetch the next pointer before the current action is released. */
        for (act = first; act != NULL; act = next) {
                next = act->dtad_next;
                dtrace_actdesc_release(act, vstate);
        }

        return (NULL);
}
13929 
13930 static dtrace_ecbdesc_t *
13931 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13932     cred_t *cr)
13933 {
13934         dtrace_ecbdesc_t *ep;
13935         dof_ecbdesc_t *ecb;
13936         dtrace_probedesc_t *desc;
13937         dtrace_predicate_t *pred = NULL;
13938 
13939         if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13940                 dtrace_dof_error(dof, "truncated ECB description");
13941                 return (NULL);
13942         }
13943 
13944         if (sec->dofs_align != sizeof (uint64_t)) {
13945                 dtrace_dof_error(dof, "bad alignment in ECB description");
13946                 return (NULL);
13947         }
13948 
13949         ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13950         sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13951 
13952         if (sec == NULL)
13953                 return (NULL);
13954 
13955         ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13956         ep->dted_uarg = ecb->dofe_uarg;
13957         desc = &ep->dted_probe;
13958 
13959         if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13960                 goto err;
13961 
13962         if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13963                 if ((sec = dtrace_dof_sect(dof,
13964                     DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13965                         goto err;
13966 
13967                 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13968                         goto err;
13969 
13970                 ep->dted_pred.dtpdd_predicate = pred;
13971         }
13972 
13973         if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13974                 if ((sec = dtrace_dof_sect(dof,
13975                     DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13976                         goto err;
13977 
13978                 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13979 
13980                 if (ep->dted_action == NULL)
13981                         goto err;
13982         }
13983 
13984         return (ep);
13985 
13986 err:
13987         if (pred != NULL)
13988                 dtrace_predicate_release(pred, vstate);
13989         kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13990         return (NULL);
13991 }
13992 
13993 /*
13994  * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13995  * specified DOF.  SETX relocations are computed using 'ubase', the base load
13996  * address of the object containing the DOF, and DOFREL relocations are relative
13997  * to the relocation offset within the DOF.
13998  */
13999 static int
14000 dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase,
14001     uint64_t udaddr)
14002 {
14003         uintptr_t daddr = (uintptr_t)dof;
14004         uintptr_t ts_end;
14005         dof_relohdr_t *dofr =
14006             (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
14007         dof_sec_t *ss, *rs, *ts;
14008         dof_relodesc_t *r;
14009         uint_t i, n;
14010 
14011         if (sec->dofs_size < sizeof (dof_relohdr_t) ||
14012             sec->dofs_align != sizeof (dof_secidx_t)) {
14013                 dtrace_dof_error(dof, "invalid relocation header");
14014                 return (-1);
14015         }
14016 
14017         ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
14018         rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
14019         ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
14020         ts_end = (uintptr_t)ts + sizeof (dof_sec_t);
14021 
14022         if (ss == NULL || rs == NULL || ts == NULL)
14023                 return (-1); /* dtrace_dof_error() has been called already */
14024 
14025         if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
14026             rs->dofs_align != sizeof (uint64_t)) {
14027                 dtrace_dof_error(dof, "invalid relocation section");
14028                 return (-1);
14029         }
14030 
14031         r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
14032         n = rs->dofs_size / rs->dofs_entsize;
14033 
14034         for (i = 0; i < n; i++) {
14035                 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
14036 
14037                 switch (r->dofr_type) {
14038                 case DOF_RELO_NONE:
14039                         break;
14040                 case DOF_RELO_SETX:
14041                 case DOF_RELO_DOFREL:
14042                         if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
14043                             sizeof (uint64_t) > ts->dofs_size) {
14044                                 dtrace_dof_error(dof, "bad relocation offset");
14045                                 return (-1);
14046                         }
14047 
14048                         if (taddr >= (uintptr_t)ts && taddr < ts_end) {
14049                                 dtrace_dof_error(dof, "bad relocation offset");
14050                                 return (-1);
14051                         }
14052 
14053                         if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
14054                                 dtrace_dof_error(dof, "misaligned setx relo");
14055                                 return (-1);
14056                         }
14057 
14058                         if (r->dofr_type == DOF_RELO_SETX)
14059                                 *(uint64_t *)taddr += ubase;
14060                         else
14061                                 *(uint64_t *)taddr +=
14062                                     udaddr + ts->dofs_offset + r->dofr_offset;
14063                         break;
14064                 default:
14065                         dtrace_dof_error(dof, "invalid relocation type");
14066                         return (-1);
14067                 }
14068 
14069                 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
14070         }
14071 
14072         return (0);
14073 }
14074 
14075 /*
14076  * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
14077  * header:  it should be at the front of a memory region that is at least
14078  * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
14079  * size.  It need not be validated in any other way.
14080  */
14081 static int
14082 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
14083     dtrace_enabling_t **enabp, uint64_t ubase, uint64_t udaddr, int noprobes)
14084 {
14085         uint64_t len = dof->dofh_loadsz, seclen;
14086         uintptr_t daddr = (uintptr_t)dof;
14087         dtrace_ecbdesc_t *ep;
14088         dtrace_enabling_t *enab;
14089         uint_t i;
14090 
14091         ASSERT(MUTEX_HELD(&dtrace_lock));
14092         ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
14093 
14094         /*
14095          * Check the DOF header identification bytes.  In addition to checking
14096          * valid settings, we also verify that unused bits/bytes are zeroed so
14097          * we can use them later without fear of regressing existing binaries.
14098          */
14099         if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
14100             DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
14101                 dtrace_dof_error(dof, "DOF magic string mismatch");
14102                 return (-1);
14103         }
14104 
14105         if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
14106             dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
14107                 dtrace_dof_error(dof, "DOF has invalid data model");
14108                 return (-1);
14109         }
14110 
14111         if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
14112                 dtrace_dof_error(dof, "DOF encoding mismatch");
14113                 return (-1);
14114         }
14115 
14116         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14117             dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
14118                 dtrace_dof_error(dof, "DOF version mismatch");
14119                 return (-1);
14120         }
14121 
14122         if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
14123                 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
14124                 return (-1);
14125         }
14126 
14127         if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
14128                 dtrace_dof_error(dof, "DOF uses too many integer registers");
14129                 return (-1);
14130         }
14131 
14132         if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
14133                 dtrace_dof_error(dof, "DOF uses too many tuple registers");
14134                 return (-1);
14135         }
14136 
14137         for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
14138                 if (dof->dofh_ident[i] != 0) {
14139                         dtrace_dof_error(dof, "DOF has invalid ident byte set");
14140                         return (-1);
14141                 }
14142         }
14143 
14144         if (dof->dofh_flags & ~DOF_FL_VALID) {
14145                 dtrace_dof_error(dof, "DOF has invalid flag bits set");
14146                 return (-1);
14147         }
14148 
14149         if (dof->dofh_secsize == 0) {
14150                 dtrace_dof_error(dof, "zero section header size");
14151                 return (-1);
14152         }
14153 
14154         /*
14155          * Check that the section headers don't exceed the amount of DOF
14156          * data.  Note that we cast the section size and number of sections
14157          * to uint64_t's to prevent possible overflow in the multiplication.
14158          */
14159         seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
14160 
14161         if (dof->dofh_secoff > len || seclen > len ||
14162             dof->dofh_secoff + seclen > len) {
14163                 dtrace_dof_error(dof, "truncated section headers");
14164                 return (-1);
14165         }
14166 
14167         if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
14168                 dtrace_dof_error(dof, "misaligned section headers");
14169                 return (-1);
14170         }
14171 
14172         if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
14173                 dtrace_dof_error(dof, "misaligned section size");
14174                 return (-1);
14175         }
14176 
14177         /*
14178          * Take an initial pass through the section headers to be sure that
14179          * the headers don't have stray offsets.  If the 'noprobes' flag is
14180          * set, do not permit sections relating to providers, probes, or args.
14181          */
14182         for (i = 0; i < dof->dofh_secnum; i++) {
14183                 dof_sec_t *sec = (dof_sec_t *)(daddr +
14184                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
14185 
14186                 if (noprobes) {
14187                         switch (sec->dofs_type) {
14188                         case DOF_SECT_PROVIDER:
14189                         case DOF_SECT_PROBES:
14190                         case DOF_SECT_PRARGS:
14191                         case DOF_SECT_PROFFS:
14192                                 dtrace_dof_error(dof, "illegal sections "
14193                                     "for enabling");
14194                                 return (-1);
14195                         }
14196                 }
14197 
14198                 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
14199                     !(sec->dofs_flags & DOF_SECF_LOAD)) {
14200                         dtrace_dof_error(dof, "loadable section with load "
14201                             "flag unset");
14202                         return (-1);
14203                 }
14204 
14205                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
14206                         continue; /* just ignore non-loadable sections */
14207 
14208                 if (!ISP2(sec->dofs_align)) {
14209                         dtrace_dof_error(dof, "bad section alignment");
14210                         return (-1);
14211                 }
14212 
14213                 if (sec->dofs_offset & (sec->dofs_align - 1)) {
14214                         dtrace_dof_error(dof, "misaligned section");
14215                         return (-1);
14216                 }
14217 
14218                 if (sec->dofs_offset > len || sec->dofs_size > len ||
14219                     sec->dofs_offset + sec->dofs_size > len) {
14220                         dtrace_dof_error(dof, "corrupt section header");
14221                         return (-1);
14222                 }
14223 
14224                 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
14225                     sec->dofs_offset + sec->dofs_size - 1) != '\0') {
14226                         dtrace_dof_error(dof, "non-terminating string table");
14227                         return (-1);
14228                 }
14229         }
14230 
14231         /*
14232          * Take a second pass through the sections and locate and perform any
14233          * relocations that are present.  We do this after the first pass to
14234          * be sure that all sections have had their headers validated.
14235          */
14236         for (i = 0; i < dof->dofh_secnum; i++) {
14237                 dof_sec_t *sec = (dof_sec_t *)(daddr +
14238                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
14239 
14240                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
14241                         continue; /* skip sections that are not loadable */
14242 
14243                 switch (sec->dofs_type) {
14244                 case DOF_SECT_URELHDR:
14245                         if (dtrace_dof_relocate(dof, sec, ubase, udaddr) != 0)
14246                                 return (-1);
14247                         break;
14248                 }
14249         }
14250 
14251         if ((enab = *enabp) == NULL)
14252                 enab = *enabp = dtrace_enabling_create(vstate);
14253 
14254         for (i = 0; i < dof->dofh_secnum; i++) {
14255                 dof_sec_t *sec = (dof_sec_t *)(daddr +
14256                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
14257 
14258                 if (sec->dofs_type != DOF_SECT_ECBDESC)
14259                         continue;
14260 
14261                 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
14262                         dtrace_enabling_destroy(enab);
14263                         *enabp = NULL;
14264                         return (-1);
14265                 }
14266 
14267                 dtrace_enabling_add(enab, ep);
14268         }
14269 
14270         return (0);
14271 }
14272 
14273 /*
14274  * Process DOF for any options.  This routine assumes that the DOF has been
14275  * at least processed by dtrace_dof_slurp().
14276  */
14277 static int
14278 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
14279 {
14280         int i, rval;
14281         uint32_t entsize;
14282         size_t offs;
14283         dof_optdesc_t *desc;
14284 
14285         for (i = 0; i < dof->dofh_secnum; i++) {
14286                 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
14287                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
14288 
14289                 if (sec->dofs_type != DOF_SECT_OPTDESC)
14290                         continue;
14291 
14292                 if (sec->dofs_align != sizeof (uint64_t)) {
14293                         dtrace_dof_error(dof, "bad alignment in "
14294                             "option description");
14295                         return (EINVAL);
14296                 }
14297 
14298                 if ((entsize = sec->dofs_entsize) == 0) {
14299                         dtrace_dof_error(dof, "zeroed option entry size");
14300                         return (EINVAL);
14301                 }
14302 
14303                 if (entsize < sizeof (dof_optdesc_t)) {
14304                         dtrace_dof_error(dof, "bad option entry size");
14305                         return (EINVAL);
14306                 }
14307 
14308                 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
14309                         desc = (dof_optdesc_t *)((uintptr_t)dof +
14310                             (uintptr_t)sec->dofs_offset + offs);
14311 
14312                         if (desc->dofo_strtab != DOF_SECIDX_NONE) {
14313                                 dtrace_dof_error(dof, "non-zero option string");
14314                                 return (EINVAL);
14315                         }
14316 
14317                         if (desc->dofo_value == DTRACEOPT_UNSET) {
14318                                 dtrace_dof_error(dof, "unset option");
14319                                 return (EINVAL);
14320                         }
14321 
14322                         if ((rval = dtrace_state_option(state,
14323                             desc->dofo_option, desc->dofo_value)) != 0) {
14324                                 dtrace_dof_error(dof, "rejected option");
14325                                 return (rval);
14326                         }
14327                 }
14328         }
14329 
14330         return (0);
14331 }
14332 
14333 /*
14334  * DTrace Consumer State Functions
14335  */
14336 static int
14337 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
14338 {
14339         size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
14340         void *base;
14341         uintptr_t limit;
14342         dtrace_dynvar_t *dvar, *next, *start;
14343         int i;
14344 
14345         ASSERT(MUTEX_HELD(&dtrace_lock));
14346         ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
14347 
14348         bzero(dstate, sizeof (dtrace_dstate_t));
14349 
14350         if ((dstate->dtds_chunksize = chunksize) == 0)
14351                 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
14352 
14353         VERIFY(dstate->dtds_chunksize < LONG_MAX);
14354 
14355         if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
14356                 size = min;
14357 
14358         if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
14359                 return (ENOMEM);
14360 
14361         dstate->dtds_size = size;
14362         dstate->dtds_base = base;
14363         dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
14364         bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
14365 
14366         hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
14367 
14368         if (hashsize != 1 && (hashsize & 1))
14369                 hashsize--;
14370 
14371         dstate->dtds_hashsize = hashsize;
14372         dstate->dtds_hash = dstate->dtds_base;
14373 
14374         /*
14375          * Set all of our hash buckets to point to the single sink, and (if
14376          * it hasn't already been set), set the sink's hash value to be the
14377          * sink sentinel value.  The sink is needed for dynamic variable
14378          * lookups to know that they have iterated over an entire, valid hash
14379          * chain.
14380          */
14381         for (i = 0; i < hashsize; i++)
14382                 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
14383 
14384         if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
14385                 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
14386 
14387         /*
14388          * Determine number of active CPUs.  Divide free list evenly among
14389          * active CPUs.
14390          */
14391         start = (dtrace_dynvar_t *)
14392             ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
14393         limit = (uintptr_t)base + size;
14394 
14395         VERIFY((uintptr_t)start < limit);
14396         VERIFY((uintptr_t)start >= (uintptr_t)base);
14397 
14398         maxper = (limit - (uintptr_t)start) / NCPU;
14399         maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
14400 
14401 #ifndef illumos
14402         CPU_FOREACH(i) {
14403 #else
14404         for (i = 0; i < NCPU; i++) {
14405 #endif
14406                 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
14407 
14408                 /*
14409                  * If we don't even have enough chunks to make it once through
14410                  * NCPUs, we're just going to allocate everything to the first
14411                  * CPU.  And if we're on the last CPU, we're going to allocate
14412                  * whatever is left over.  In either case, we set the limit to
14413                  * be the limit of the dynamic variable space.
14414                  */
14415                 if (maxper == 0 || i == NCPU - 1) {
14416                         limit = (uintptr_t)base + size;
14417                         start = NULL;
14418                 } else {
14419                         limit = (uintptr_t)start + maxper;
14420                         start = (dtrace_dynvar_t *)limit;
14421                 }
14422 
14423                 VERIFY(limit <= (uintptr_t)base + size);
14424 
14425                 for (;;) {
14426                         next = (dtrace_dynvar_t *)((uintptr_t)dvar +
14427                             dstate->dtds_chunksize);
14428 
14429                         if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
14430                                 break;
14431 
14432                         VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
14433                             (uintptr_t)dvar <= (uintptr_t)base + size);
14434                         dvar->dtdv_next = next;
14435                         dvar = next;
14436                 }
14437 
14438                 if (maxper == 0)
14439                         break;
14440         }
14441 
14442         return (0);
14443 }
14444 
14445 static void
14446 dtrace_dstate_fini(dtrace_dstate_t *dstate)
14447 {
14448         ASSERT(MUTEX_HELD(&cpu_lock));
14449 
14450         if (dstate->dtds_base == NULL)
14451                 return;
14452 
14453         kmem_free(dstate->dtds_base, dstate->dtds_size);
14454         kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
14455 }
14456 
14457 static void
14458 dtrace_vstate_fini(dtrace_vstate_t *vstate)
14459 {
14460         /*
14461          * Logical XOR, where are you?
14462          */
14463         ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14464 
14465         if (vstate->dtvs_nglobals > 0) {
14466                 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14467                     sizeof (dtrace_statvar_t *));
14468         }
14469 
14470         if (vstate->dtvs_ntlocals > 0) {
14471                 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14472                     sizeof (dtrace_difv_t));
14473         }
14474 
14475         ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14476 
14477         if (vstate->dtvs_nlocals > 0) {
14478                 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14479                     sizeof (dtrace_statvar_t *));
14480         }
14481 }
14482 
14483 #ifdef illumos
14484 static void
14485 dtrace_state_clean(dtrace_state_t *state)
14486 {
14487         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14488                 return;
14489 
14490         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14491         dtrace_speculation_clean(state);
14492 }
14493 
14494 static void
14495 dtrace_state_deadman(dtrace_state_t *state)
14496 {
14497         hrtime_t now;
14498 
14499         dtrace_sync();
14500 
14501         now = dtrace_gethrtime();
14502 
14503         if (state != dtrace_anon.dta_state &&
14504             now - state->dts_laststatus >= dtrace_deadman_user)
14505                 return;
14506 
14507         /*
14508          * We must be sure that dts_alive never appears to be less than the
14509          * value upon entry to dtrace_state_deadman(), and because we lack a
14510          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14511          * store INT64_MAX to it, followed by a memory barrier, followed by
14512          * the new value.  This assures that dts_alive never appears to be
14513          * less than its true value, regardless of the order in which the
14514          * stores to the underlying storage are issued.
14515          */
14516         state->dts_alive = INT64_MAX;
14517         dtrace_membar_producer();
14518         state->dts_alive = now;
14519 }
14520 #else   /* !illumos */
14521 static void
14522 dtrace_state_clean(void *arg)
14523 {
14524         dtrace_state_t *state = arg;
14525         dtrace_optval_t *opt = state->dts_options;
14526 
14527         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14528                 return;
14529 
14530         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14531         dtrace_speculation_clean(state);
14532 
14533         callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14534             dtrace_state_clean, state);
14535 }
14536 
14537 static void
14538 dtrace_state_deadman(void *arg)
14539 {
14540         dtrace_state_t *state = arg;
14541         hrtime_t now;
14542 
14543         dtrace_sync();
14544 
14545         dtrace_debug_output();
14546 
14547         now = dtrace_gethrtime();
14548 
14549         if (state != dtrace_anon.dta_state &&
14550             now - state->dts_laststatus >= dtrace_deadman_user)
14551                 return;
14552 
14553         /*
14554          * We must be sure that dts_alive never appears to be less than the
14555          * value upon entry to dtrace_state_deadman(), and because we lack a
14556          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14557          * store INT64_MAX to it, followed by a memory barrier, followed by
14558          * the new value.  This assures that dts_alive never appears to be
14559          * less than its true value, regardless of the order in which the
14560          * stores to the underlying storage are issued.
14561          */
14562         state->dts_alive = INT64_MAX;
14563         dtrace_membar_producer();
14564         state->dts_alive = now;
14565 
14566         callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14567             dtrace_state_deadman, state);
14568 }
14569 #endif  /* illumos */
14570 
14571 static dtrace_state_t *
14572 #ifdef illumos
14573 dtrace_state_create(dev_t *devp, cred_t *cr)
14574 #else
14575 dtrace_state_create(struct cdev *dev, struct ucred *cred __unused)
14576 #endif
14577 {
14578 #ifdef illumos
14579         minor_t minor;
14580         major_t major;
14581 #else
14582         cred_t *cr = NULL;
14583         int m = 0;
14584 #endif
14585         char c[30];
14586         dtrace_state_t *state;
14587         dtrace_optval_t *opt;
14588         int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
14589         int cpu_it;
14590 
14591         ASSERT(MUTEX_HELD(&dtrace_lock));
14592         ASSERT(MUTEX_HELD(&cpu_lock));
14593 
14594 #ifdef illumos
14595         minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
14596             VM_BESTFIT | VM_SLEEP);
14597 
14598         if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
14599                 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14600                 return (NULL);
14601         }
14602 
14603         state = ddi_get_soft_state(dtrace_softstate, minor);
14604 #else
14605         if (dev != NULL) {
14606                 cr = dev->si_cred;
14607                 m = dev2unit(dev);
14608         }
14609 
14610         /* Allocate memory for the state. */
14611         state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
14612 #endif
14613 
14614         state->dts_epid = DTRACE_EPIDNONE + 1;
14615 
14616         (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
14617 #ifdef illumos
14618         state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14619             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14620 
14621         if (devp != NULL) {
14622                 major = getemajor(*devp);
14623         } else {
14624                 major = ddi_driver_major(dtrace_devi);
14625         }
14626 
14627         state->dts_dev = makedevice(major, minor);
14628 
14629         if (devp != NULL)
14630                 *devp = state->dts_dev;
14631 #else
14632         state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
14633         state->dts_dev = dev;
14634 #endif
14635 
14636         /*
14637          * We allocate NCPU buffers.  On the one hand, this can be quite
14638          * a bit of memory per instance (nearly 36K on a Starcat).  On the
14639          * other hand, it saves an additional memory reference in the probe
14640          * path.
14641          */
14642         state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14643         state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14644 
14645         /*
14646          * Allocate and initialise the per-process per-CPU random state.
14647          * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON therefore entropy device is
14648          * assumed to be seeded at this point (if from Fortuna seed file).
14649          */
14650         arc4random_buf(&state->dts_rstate[0], 2 * sizeof(uint64_t));
14651         for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
14652                 /*
14653                  * Each CPU is assigned a 2^64 period, non-overlapping
14654                  * subsequence.
14655                  */
14656                 dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
14657                     state->dts_rstate[cpu_it]); 
14658         }
14659 
14660 #ifdef illumos
14661         state->dts_cleaner = CYCLIC_NONE;
14662         state->dts_deadman = CYCLIC_NONE;
14663 #else
14664         callout_init(&state->dts_cleaner, 1);
14665         callout_init(&state->dts_deadman, 1);
14666 #endif
14667         state->dts_vstate.dtvs_state = state;
14668 
14669         for (i = 0; i < DTRACEOPT_MAX; i++)
14670                 state->dts_options[i] = DTRACEOPT_UNSET;
14671 
14672         /*
14673          * Set the default options.
14674          */
14675         opt = state->dts_options;
14676         opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14677         opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14678         opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14679         opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14680         opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14681         opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14682         opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14683         opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14684         opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14685         opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14686         opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14687         opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14688         opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14689         opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14690 
14691         state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
14692 
14693         /*
14694          * Depending on the user credentials, we set flag bits which alter probe
14695          * visibility or the amount of destructiveness allowed.  In the case of
14696          * actual anonymous tracing, or the possession of all privileges, all of
14697          * the normal checks are bypassed.
14698          */
14699         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14700                 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14701                 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14702         } else {
14703                 /*
14704                  * Set up the credentials for this instantiation.  We take a
14705                  * hold on the credential to prevent it from disappearing on
14706                  * us; this in turn prevents the zone_t referenced by this
14707                  * credential from disappearing.  This means that we can
14708                  * examine the credential and the zone from probe context.
14709                  */
14710                 crhold(cr);
14711                 state->dts_cred.dcr_cred = cr;
14712 
14713                 /*
14714                  * CRA_PROC means "we have *some* privilege for dtrace" and
14715                  * unlocks the use of variables like pid, zonename, etc.
14716                  */
14717                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14718                     PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14719                         state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14720                 }
14721 
14722                 /*
14723                  * dtrace_user allows use of syscall and profile providers.
14724                  * If the user also has proc_owner and/or proc_zone, we
14725                  * extend the scope to include additional visibility and
14726                  * destructive power.
14727                  */
14728                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14729                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14730                                 state->dts_cred.dcr_visible |=
14731                                     DTRACE_CRV_ALLPROC;
14732 
14733                                 state->dts_cred.dcr_action |=
14734                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14735                         }
14736 
14737                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14738                                 state->dts_cred.dcr_visible |=
14739                                     DTRACE_CRV_ALLZONE;
14740 
14741                                 state->dts_cred.dcr_action |=
14742                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14743                         }
14744 
14745                         /*
14746                          * If we have all privs in whatever zone this is,
14747                          * we can do destructive things to processes which
14748                          * have altered credentials.
14749                          */
14750 #ifdef illumos
14751                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14752                             cr->cr_zone->zone_privset)) {
14753                                 state->dts_cred.dcr_action |=
14754                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14755                         }
14756 #endif
14757                 }
14758 
14759                 /*
14760                  * Holding the dtrace_kernel privilege also implies that
14761                  * the user has the dtrace_user privilege from a visibility
14762                  * perspective.  But without further privileges, some
14763                  * destructive actions are not available.
14764                  */
14765                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14766                         /*
14767                          * Make all probes in all zones visible.  However,
14768                          * this doesn't mean that all actions become available
14769                          * to all zones.
14770                          */
14771                         state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14772                             DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14773 
14774                         state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14775                             DTRACE_CRA_PROC;
14776                         /*
14777                          * Holding proc_owner means that destructive actions
14778                          * for *this* zone are allowed.
14779                          */
14780                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14781                                 state->dts_cred.dcr_action |=
14782                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14783 
14784                         /*
14785                          * Holding proc_zone means that destructive actions
14786                          * for this user/group ID in all zones is allowed.
14787                          */
14788                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14789                                 state->dts_cred.dcr_action |=
14790                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14791 
14792 #ifdef illumos
14793                         /*
14794                          * If we have all privs in whatever zone this is,
14795                          * we can do destructive things to processes which
14796                          * have altered credentials.
14797                          */
14798                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14799                             cr->cr_zone->zone_privset)) {
14800                                 state->dts_cred.dcr_action |=
14801                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14802                         }
14803 #endif
14804                 }
14805 
14806                 /*
14807                  * Holding the dtrace_proc privilege gives control over fasttrap
14808                  * and pid providers.  We need to grant wider destructive
14809                  * privileges in the event that the user has proc_owner and/or
14810                  * proc_zone.
14811                  */
14812                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14813                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14814                                 state->dts_cred.dcr_action |=
14815                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14816 
14817                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14818                                 state->dts_cred.dcr_action |=
14819                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14820                 }
14821         }
14822 
14823         return (state);
14824 }
14825 
14826 static int
14827 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14828 {
14829         dtrace_optval_t *opt = state->dts_options, size;
14830         processorid_t cpu = 0;
14831         int flags = 0, rval, factor, divisor = 1;
14832 
14833         ASSERT(MUTEX_HELD(&dtrace_lock));
14834         ASSERT(MUTEX_HELD(&cpu_lock));
14835         ASSERT(which < DTRACEOPT_MAX);
14836         ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14837             (state == dtrace_anon.dta_state &&
14838             state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14839 
14840         if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14841                 return (0);
14842 
14843         if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14844                 cpu = opt[DTRACEOPT_CPU];
14845 
14846         if (which == DTRACEOPT_SPECSIZE)
14847                 flags |= DTRACEBUF_NOSWITCH;
14848 
14849         if (which == DTRACEOPT_BUFSIZE) {
14850                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14851                         flags |= DTRACEBUF_RING;
14852 
14853                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14854                         flags |= DTRACEBUF_FILL;
14855 
14856                 if (state != dtrace_anon.dta_state ||
14857                     state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14858                         flags |= DTRACEBUF_INACTIVE;
14859         }
14860 
14861         for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
14862                 /*
14863                  * The size must be 8-byte aligned.  If the size is not 8-byte
14864                  * aligned, drop it down by the difference.
14865                  */
14866                 if (size & (sizeof (uint64_t) - 1))
14867                         size -= size & (sizeof (uint64_t) - 1);
14868 
14869                 if (size < state->dts_reserve) {
14870                         /*
14871                          * Buffers always must be large enough to accommodate
14872                          * their prereserved space.  We return E2BIG instead
14873                          * of ENOMEM in this case to allow for user-level
14874                          * software to differentiate the cases.
14875                          */
14876                         return (E2BIG);
14877                 }
14878 
14879                 rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
14880 
14881                 if (rval != ENOMEM) {
14882                         opt[which] = size;
14883                         return (rval);
14884                 }
14885 
14886                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14887                         return (rval);
14888 
14889                 for (divisor = 2; divisor < factor; divisor <<= 1)
14890                         continue;
14891         }
14892 
14893         return (ENOMEM);
14894 }
14895 
14896 static int
14897 dtrace_state_buffers(dtrace_state_t *state)
14898 {
14899         dtrace_speculation_t *spec = state->dts_speculations;
14900         int rval, i;
14901 
14902         if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14903             DTRACEOPT_BUFSIZE)) != 0)
14904                 return (rval);
14905 
14906         if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14907             DTRACEOPT_AGGSIZE)) != 0)
14908                 return (rval);
14909 
14910         for (i = 0; i < state->dts_nspeculations; i++) {
14911                 if ((rval = dtrace_state_buffer(state,
14912                     spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14913                         return (rval);
14914         }
14915 
14916         return (0);
14917 }
14918 
14919 static void
14920 dtrace_state_prereserve(dtrace_state_t *state)
14921 {
14922         dtrace_ecb_t *ecb;
14923         dtrace_probe_t *probe;
14924 
14925         state->dts_reserve = 0;
14926 
14927         if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14928                 return;
14929 
14930         /*
14931          * If our buffer policy is a "fill" buffer policy, we need to set the
14932          * prereserved space to be the space required by the END probes.
14933          */
14934         probe = dtrace_probes[dtrace_probeid_end - 1];
14935         ASSERT(probe != NULL);
14936 
14937         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14938                 if (ecb->dte_state != state)
14939                         continue;
14940 
14941                 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14942         }
14943 }
14944 
14945 static int
14946 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14947 {
14948         dtrace_optval_t *opt = state->dts_options, sz, nspec;
14949         dtrace_speculation_t *spec;
14950         dtrace_buffer_t *buf;
14951 #ifdef illumos
14952         cyc_handler_t hdlr;
14953         cyc_time_t when;
14954 #endif
14955         int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14956         dtrace_icookie_t cookie;
14957 
14958         mutex_enter(&cpu_lock);
14959         mutex_enter(&dtrace_lock);
14960 
14961         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14962                 rval = EBUSY;
14963                 goto out;
14964         }
14965 
14966         /*
14967          * Before we can perform any checks, we must prime all of the
14968          * retained enablings that correspond to this state.
14969          */
14970         dtrace_enabling_prime(state);
14971 
14972         if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14973                 rval = EACCES;
14974                 goto out;
14975         }
14976 
14977         dtrace_state_prereserve(state);
14978 
14979         /*
14980          * Now we want to do is try to allocate our speculations.
14981          * We do not automatically resize the number of speculations; if
14982          * this fails, we will fail the operation.
14983          */
14984         nspec = opt[DTRACEOPT_NSPEC];
14985         ASSERT(nspec != DTRACEOPT_UNSET);
14986 
14987         if (nspec > INT_MAX) {
14988                 rval = ENOMEM;
14989                 goto out;
14990         }
14991 
14992         spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
14993             KM_NOSLEEP | KM_NORMALPRI);
14994 
14995         if (spec == NULL) {
14996                 rval = ENOMEM;
14997                 goto out;
14998         }
14999 
15000         state->dts_speculations = spec;
15001         state->dts_nspeculations = (int)nspec;
15002 
15003         for (i = 0; i < nspec; i++) {
15004                 if ((buf = kmem_zalloc(bufsize,
15005                     KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
15006                         rval = ENOMEM;
15007                         goto err;
15008                 }
15009 
15010                 spec[i].dtsp_buffer = buf;
15011         }
15012 
15013         if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
15014                 if (dtrace_anon.dta_state == NULL) {
15015                         rval = ENOENT;
15016                         goto out;
15017                 }
15018 
15019                 if (state->dts_necbs != 0) {
15020                         rval = EALREADY;
15021                         goto out;
15022                 }
15023 
15024                 state->dts_anon = dtrace_anon_grab();
15025                 ASSERT(state->dts_anon != NULL);
15026                 state = state->dts_anon;
15027 
15028                 /*
15029                  * We want "grabanon" to be set in the grabbed state, so we'll
15030                  * copy that option value from the grabbing state into the
15031                  * grabbed state.
15032                  */
15033                 state->dts_options[DTRACEOPT_GRABANON] =
15034                     opt[DTRACEOPT_GRABANON];
15035 
15036                 *cpu = dtrace_anon.dta_beganon;
15037 
15038                 /*
15039                  * If the anonymous state is active (as it almost certainly
15040                  * is if the anonymous enabling ultimately matched anything),
15041                  * we don't allow any further option processing -- but we
15042                  * don't return failure.
15043                  */
15044                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
15045                         goto out;
15046         }
15047 
15048         if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
15049             opt[DTRACEOPT_AGGSIZE] != 0) {
15050                 if (state->dts_aggregations == NULL) {
15051                         /*
15052                          * We're not going to create an aggregation buffer
15053                          * because we don't have any ECBs that contain
15054                          * aggregations -- set this option to 0.
15055                          */
15056                         opt[DTRACEOPT_AGGSIZE] = 0;
15057                 } else {
15058                         /*
15059                          * If we have an aggregation buffer, we must also have
15060                          * a buffer to use as scratch.
15061                          */
15062                         if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
15063                             opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
15064                                 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
15065                         }
15066                 }
15067         }
15068 
15069         if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
15070             opt[DTRACEOPT_SPECSIZE] != 0) {
15071                 if (!state->dts_speculates) {
15072                         /*
15073                          * We're not going to create speculation buffers
15074                          * because we don't have any ECBs that actually
15075                          * speculate -- set the speculation size to 0.
15076                          */
15077                         opt[DTRACEOPT_SPECSIZE] = 0;
15078                 }
15079         }
15080 
15081         /*
15082          * The bare minimum size for any buffer that we're actually going to
15083          * do anything to is sizeof (uint64_t).
15084          */
15085         sz = sizeof (uint64_t);
15086 
15087         if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
15088             (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
15089             (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
15090                 /*
15091                  * A buffer size has been explicitly set to 0 (or to a size
15092                  * that will be adjusted to 0) and we need the space -- we
15093                  * need to return failure.  We return ENOSPC to differentiate
15094                  * it from failing to allocate a buffer due to failure to meet
15095                  * the reserve (for which we return E2BIG).
15096                  */
15097                 rval = ENOSPC;
15098                 goto out;
15099         }
15100 
15101         if ((rval = dtrace_state_buffers(state)) != 0)
15102                 goto err;
15103 
15104         if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
15105                 sz = dtrace_dstate_defsize;
15106 
15107         do {
15108                 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
15109 
15110                 if (rval == 0)
15111                         break;
15112 
15113                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
15114                         goto err;
15115         } while (sz >>= 1);
15116 
15117         opt[DTRACEOPT_DYNVARSIZE] = sz;
15118 
15119         if (rval != 0)
15120                 goto err;
15121 
15122         if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
15123                 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
15124 
15125         if (opt[DTRACEOPT_CLEANRATE] == 0)
15126                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
15127 
15128         if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
15129                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
15130 
15131         if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
15132                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
15133 
15134         state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
15135 #ifdef illumos
15136         hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
15137         hdlr.cyh_arg = state;
15138         hdlr.cyh_level = CY_LOW_LEVEL;
15139 
15140         when.cyt_when = 0;
15141         when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
15142 
15143         state->dts_cleaner = cyclic_add(&hdlr, &when);
15144 
15145         hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
15146         hdlr.cyh_arg = state;
15147         hdlr.cyh_level = CY_LOW_LEVEL;
15148 
15149         when.cyt_when = 0;
15150         when.cyt_interval = dtrace_deadman_interval;
15151 
15152         state->dts_deadman = cyclic_add(&hdlr, &when);
15153 #else
15154         callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
15155             dtrace_state_clean, state);
15156         callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
15157             dtrace_state_deadman, state);
15158 #endif
15159 
15160         state->dts_activity = DTRACE_ACTIVITY_WARMUP;
15161 
15162 #ifdef illumos
15163         if (state->dts_getf != 0 &&
15164             !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
15165                 /*
15166                  * We don't have kernel privs but we have at least one call
15167                  * to getf(); we need to bump our zone's count, and (if
15168                  * this is the first enabling to have an unprivileged call
15169                  * to getf()) we need to hook into closef().
15170                  */
15171                 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
15172 
15173                 if (dtrace_getf++ == 0) {
15174                         ASSERT(dtrace_closef == NULL);
15175                         dtrace_closef = dtrace_getf_barrier;
15176                 }
15177         }
15178 #endif
15179 
15180         /*
15181          * Now it's time to actually fire the BEGIN probe.  We need to disable
15182          * interrupts here both to record the CPU on which we fired the BEGIN
15183          * probe (the data from this CPU will be processed first at user
15184          * level) and to manually activate the buffer for this CPU.
15185          */
15186         cookie = dtrace_interrupt_disable();
15187         *cpu = curcpu;
15188         ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
15189         state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
15190 
15191         dtrace_probe(dtrace_probeid_begin,
15192             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
15193         dtrace_interrupt_enable(cookie);
15194         /*
15195          * We may have had an exit action from a BEGIN probe; only change our
15196          * state to ACTIVE if we're still in WARMUP.
15197          */
15198         ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
15199             state->dts_activity == DTRACE_ACTIVITY_DRAINING);
15200 
15201         if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
15202                 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
15203 
15204 #ifdef __FreeBSD__
15205         /*
15206          * We enable anonymous tracing before APs are started, so we must
15207          * activate buffers using the current CPU.
15208          */
15209         if (state == dtrace_anon.dta_state)
15210                 for (int i = 0; i < NCPU; i++)
15211                         dtrace_buffer_activate_cpu(state, i);
15212         else
15213                 dtrace_xcall(DTRACE_CPUALL,
15214                     (dtrace_xcall_t)dtrace_buffer_activate, state);
15215 #else
15216         /*
15217          * Regardless of whether or not now we're in ACTIVE or DRAINING, we
15218          * want each CPU to transition its principal buffer out of the
15219          * INACTIVE state.  Doing this assures that no CPU will suddenly begin
15220          * processing an ECB halfway down a probe's ECB chain; all CPUs will
15221          * atomically transition from processing none of a state's ECBs to
15222          * processing all of them.
15223          */
15224         dtrace_xcall(DTRACE_CPUALL,
15225             (dtrace_xcall_t)dtrace_buffer_activate, state);
15226 #endif
15227         goto out;
15228 
15229 err:
15230         dtrace_buffer_free(state->dts_buffer);
15231         dtrace_buffer_free(state->dts_aggbuffer);
15232 
15233         if ((nspec = state->dts_nspeculations) == 0) {
15234                 ASSERT(state->dts_speculations == NULL);
15235                 goto out;
15236         }
15237 
15238         spec = state->dts_speculations;
15239         ASSERT(spec != NULL);
15240 
15241         for (i = 0; i < state->dts_nspeculations; i++) {
15242                 if ((buf = spec[i].dtsp_buffer) == NULL)
15243                         break;
15244 
15245                 dtrace_buffer_free(buf);
15246                 kmem_free(buf, bufsize);
15247         }
15248 
15249         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
15250         state->dts_nspeculations = 0;
15251         state->dts_speculations = NULL;
15252 
15253 out:
15254         mutex_exit(&dtrace_lock);
15255         mutex_exit(&cpu_lock);
15256 
15257         return (rval);
15258 }
15259 
15260 static int
15261 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
15262 {
15263         dtrace_icookie_t cookie;
15264 
15265         ASSERT(MUTEX_HELD(&dtrace_lock));
15266 
15267         if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
15268             state->dts_activity != DTRACE_ACTIVITY_DRAINING)
15269                 return (EINVAL);
15270 
15271         /*
15272          * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
15273          * to be sure that every CPU has seen it.  See below for the details
15274          * on why this is done.
15275          */
15276         state->dts_activity = DTRACE_ACTIVITY_DRAINING;
15277         dtrace_sync();
15278 
15279         /*
15280          * By this point, it is impossible for any CPU to be still processing
15281          * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
15282          * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
15283          * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
15284          * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
15285          * iff we're in the END probe.
15286          */
15287         state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
15288         dtrace_sync();
15289         ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
15290 
15291         /*
15292          * Finally, we can release the reserve and call the END probe.  We
15293          * disable interrupts across calling the END probe to allow us to
15294          * return the CPU on which we actually called the END probe.  This
15295          * allows user-land to be sure that this CPU's principal buffer is
15296          * processed last.
15297          */
15298         state->dts_reserve = 0;
15299 
15300         cookie = dtrace_interrupt_disable();
15301         *cpu = curcpu;
15302         dtrace_probe(dtrace_probeid_end,
15303             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
15304         dtrace_interrupt_enable(cookie);
15305 
15306         state->dts_activity = DTRACE_ACTIVITY_STOPPED;
15307         dtrace_sync();
15308 
15309 #ifdef illumos
15310         if (state->dts_getf != 0 &&
15311             !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
15312                 /*
15313                  * We don't have kernel privs but we have at least one call
15314                  * to getf(); we need to lower our zone's count, and (if
15315                  * this is the last enabling to have an unprivileged call
15316                  * to getf()) we need to clear the closef() hook.
15317                  */
15318                 ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
15319                 ASSERT(dtrace_closef == dtrace_getf_barrier);
15320                 ASSERT(dtrace_getf > 0);
15321 
15322                 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
15323 
15324                 if (--dtrace_getf == 0)
15325                         dtrace_closef = NULL;
15326         }
15327 #endif
15328 
15329         return (0);
15330 }
15331 
15332 static int
15333 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
15334     dtrace_optval_t val)
15335 {
15336         ASSERT(MUTEX_HELD(&dtrace_lock));
15337 
15338         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
15339                 return (EBUSY);
15340 
15341         if (option >= DTRACEOPT_MAX)
15342                 return (EINVAL);
15343 
15344         if (option != DTRACEOPT_CPU && val < 0)
15345                 return (EINVAL);
15346 
15347         switch (option) {
15348         case DTRACEOPT_DESTRUCTIVE:
15349                 if (dtrace_destructive_disallow)
15350                         return (EACCES);
15351 
15352                 state->dts_cred.dcr_destructive = 1;
15353                 break;
15354 
15355         case DTRACEOPT_BUFSIZE:
15356         case DTRACEOPT_DYNVARSIZE:
15357         case DTRACEOPT_AGGSIZE:
15358         case DTRACEOPT_SPECSIZE:
15359         case DTRACEOPT_STRSIZE:
15360                 if (val < 0)
15361                         return (EINVAL);
15362 
15363                 if (val >= LONG_MAX) {
15364                         /*
15365                          * If this is an otherwise negative value, set it to
15366                          * the highest multiple of 128m less than LONG_MAX.
15367                          * Technically, we're adjusting the size without
15368                          * regard to the buffer resizing policy, but in fact,
15369                          * this has no effect -- if we set the buffer size to
15370                          * ~LONG_MAX and the buffer policy is ultimately set to
15371                          * be "manual", the buffer allocation is guaranteed to
15372                          * fail, if only because the allocation requires two
15373                          * buffers.  (We set the the size to the highest
15374                          * multiple of 128m because it ensures that the size
15375                          * will remain a multiple of a megabyte when
15376                          * repeatedly halved -- all the way down to 15m.)
15377                          */
15378                         val = LONG_MAX - (1 << 27) + 1;
15379                 }
15380         }
15381 
15382         state->dts_options[option] = val;
15383 
15384         return (0);
15385 }
15386 
15387 static void
15388 dtrace_state_destroy(dtrace_state_t *state)
15389 {
15390         dtrace_ecb_t *ecb;
15391         dtrace_vstate_t *vstate = &state->dts_vstate;
15392 #ifdef illumos
15393         minor_t minor = getminor(state->dts_dev);
15394 #endif
15395         int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
15396         dtrace_speculation_t *spec = state->dts_speculations;
15397         int nspec = state->dts_nspeculations;
15398         uint32_t match;
15399 
15400         ASSERT(MUTEX_HELD(&dtrace_lock));
15401         ASSERT(MUTEX_HELD(&cpu_lock));
15402 
15403         /*
15404          * First, retract any retained enablings for this state.
15405          */
15406         dtrace_enabling_retract(state);
15407         ASSERT(state->dts_nretained == 0);
15408 
15409         if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
15410             state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
15411                 /*
15412                  * We have managed to come into dtrace_state_destroy() on a
15413                  * hot enabling -- almost certainly because of a disorderly
15414                  * shutdown of a consumer.  (That is, a consumer that is
15415                  * exiting without having called dtrace_stop().) In this case,
15416                  * we're going to set our activity to be KILLED, and then
15417                  * issue a sync to be sure that everyone is out of probe
15418                  * context before we start blowing away ECBs.
15419                  */
15420                 state->dts_activity = DTRACE_ACTIVITY_KILLED;
15421                 dtrace_sync();
15422         }
15423 
15424         /*
15425          * Release the credential hold we took in dtrace_state_create().
15426          */
15427         if (state->dts_cred.dcr_cred != NULL)
15428                 crfree(state->dts_cred.dcr_cred);
15429 
15430         /*
15431          * Now we can safely disable and destroy any enabled probes.  Because
15432          * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
15433          * (especially if they're all enabled), we take two passes through the
15434          * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
15435          * in the second we disable whatever is left over.
15436          */
15437         for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
15438                 for (i = 0; i < state->dts_necbs; i++) {
15439                         if ((ecb = state->dts_ecbs[i]) == NULL)
15440                                 continue;
15441 
15442                         if (match && ecb->dte_probe != NULL) {
15443                                 dtrace_probe_t *probe = ecb->dte_probe;
15444                                 dtrace_provider_t *prov = probe->dtpr_provider;
15445 
15446                                 if (!(prov->dtpv_priv.dtpp_flags & match))
15447                                         continue;
15448                         }
15449 
15450                         dtrace_ecb_disable(ecb);
15451                         dtrace_ecb_destroy(ecb);
15452                 }
15453 
15454                 if (!match)
15455                         break;
15456         }
15457 
15458         /*
15459          * Before we free the buffers, perform one more sync to assure that
15460          * every CPU is out of probe context.
15461          */
15462         dtrace_sync();
15463 
15464         dtrace_buffer_free(state->dts_buffer);
15465         dtrace_buffer_free(state->dts_aggbuffer);
15466 
15467         for (i = 0; i < nspec; i++)
15468                 dtrace_buffer_free(spec[i].dtsp_buffer);
15469 
15470 #ifdef illumos
15471         if (state->dts_cleaner != CYCLIC_NONE)
15472                 cyclic_remove(state->dts_cleaner);
15473 
15474         if (state->dts_deadman != CYCLIC_NONE)
15475                 cyclic_remove(state->dts_deadman);
15476 #else
15477         callout_stop(&state->dts_cleaner);
15478         callout_drain(&state->dts_cleaner);
15479         callout_stop(&state->dts_deadman);
15480         callout_drain(&state->dts_deadman);
15481 #endif
15482 
15483         dtrace_dstate_fini(&vstate->dtvs_dynvars);
15484         dtrace_vstate_fini(vstate);
15485         if (state->dts_ecbs != NULL)
15486                 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
15487 
15488         if (state->dts_aggregations != NULL) {
15489 #ifdef DEBUG
15490                 for (i = 0; i < state->dts_naggregations; i++)
15491                         ASSERT(state->dts_aggregations[i] == NULL);
15492 #endif
15493                 ASSERT(state->dts_naggregations > 0);
15494                 kmem_free(state->dts_aggregations,
15495                     state->dts_naggregations * sizeof (dtrace_aggregation_t *));
15496         }
15497 
15498         kmem_free(state->dts_buffer, bufsize);
15499         kmem_free(state->dts_aggbuffer, bufsize);
15500 
15501         for (i = 0; i < nspec; i++)
15502                 kmem_free(spec[i].dtsp_buffer, bufsize);
15503 
15504         if (spec != NULL)
15505                 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
15506 
15507         dtrace_format_destroy(state);
15508 
15509         if (state->dts_aggid_arena != NULL) {
15510 #ifdef illumos
15511                 vmem_destroy(state->dts_aggid_arena);
15512 #else
15513                 delete_unrhdr(state->dts_aggid_arena);
15514 #endif
15515                 state->dts_aggid_arena = NULL;
15516         }
15517 #ifdef illumos
15518         ddi_soft_state_free(dtrace_softstate, minor);
15519         vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
15520 #endif
15521 }
15522 
15523 /*
15524  * DTrace Anonymous Enabling Functions
15525  */
15526 static dtrace_state_t *
15527 dtrace_anon_grab(void)
15528 {
15529         dtrace_state_t *state;
15530 
15531         ASSERT(MUTEX_HELD(&dtrace_lock));
15532 
15533         if ((state = dtrace_anon.dta_state) == NULL) {
15534                 ASSERT(dtrace_anon.dta_enabling == NULL);
15535                 return (NULL);
15536         }
15537 
15538         ASSERT(dtrace_anon.dta_enabling != NULL);
15539         ASSERT(dtrace_retained != NULL);
15540 
15541         dtrace_enabling_destroy(dtrace_anon.dta_enabling);
15542         dtrace_anon.dta_enabling = NULL;
15543         dtrace_anon.dta_state = NULL;
15544 
15545         return (state);
15546 }
15547 
15548 static void
15549 dtrace_anon_property(void)
15550 {
15551         int i, rv;
15552         dtrace_state_t *state;
15553         dof_hdr_t *dof;
15554         char c[32];             /* enough for "dof-data-" + digits */
15555 
15556         ASSERT(MUTEX_HELD(&dtrace_lock));
15557         ASSERT(MUTEX_HELD(&cpu_lock));
15558 
15559         for (i = 0; ; i++) {
15560                 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
15561 
15562                 dtrace_err_verbose = 1;
15563 
15564                 if ((dof = dtrace_dof_property(c)) == NULL) {
15565                         dtrace_err_verbose = 0;
15566                         break;
15567                 }
15568 
15569 #ifdef illumos
15570                 /*
15571                  * We want to create anonymous state, so we need to transition
15572                  * the kernel debugger to indicate that DTrace is active.  If
15573                  * this fails (e.g. because the debugger has modified text in
15574                  * some way), we won't continue with the processing.
15575                  */
15576                 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15577                         cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15578                             "enabling ignored.");
15579                         dtrace_dof_destroy(dof);
15580                         break;
15581                 }
15582 #endif
15583 
15584                 /*
15585                  * If we haven't allocated an anonymous state, we'll do so now.
15586                  */
15587                 if ((state = dtrace_anon.dta_state) == NULL) {
15588                         state = dtrace_state_create(NULL, NULL);
15589                         dtrace_anon.dta_state = state;
15590 
15591                         if (state == NULL) {
15592                                 /*
15593                                  * This basically shouldn't happen:  the only
15594                                  * failure mode from dtrace_state_create() is a
15595                                  * failure of ddi_soft_state_zalloc() that
15596                                  * itself should never happen.  Still, the
15597                                  * interface allows for a failure mode, and
15598                                  * we want to fail as gracefully as possible:
15599                                  * we'll emit an error message and cease
15600                                  * processing anonymous state in this case.
15601                                  */
15602                                 cmn_err(CE_WARN, "failed to create "
15603                                     "anonymous state");
15604                                 dtrace_dof_destroy(dof);
15605                                 break;
15606                         }
15607                 }
15608 
15609                 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15610                     &dtrace_anon.dta_enabling, 0, 0, B_TRUE);
15611 
15612                 if (rv == 0)
15613                         rv = dtrace_dof_options(dof, state);
15614 
15615                 dtrace_err_verbose = 0;
15616                 dtrace_dof_destroy(dof);
15617 
15618                 if (rv != 0) {
15619                         /*
15620                          * This is malformed DOF; chuck any anonymous state
15621                          * that we created.
15622                          */
15623                         ASSERT(dtrace_anon.dta_enabling == NULL);
15624                         dtrace_state_destroy(state);
15625                         dtrace_anon.dta_state = NULL;
15626                         break;
15627                 }
15628 
15629                 ASSERT(dtrace_anon.dta_enabling != NULL);
15630         }
15631 
15632         if (dtrace_anon.dta_enabling != NULL) {
15633                 int rval;
15634 
15635                 /*
15636                  * dtrace_enabling_retain() can only fail because we are
15637                  * trying to retain more enablings than are allowed -- but
15638                  * we only have one anonymous enabling, and we are guaranteed
15639                  * to be allowed at least one retained enabling; we assert
15640                  * that dtrace_enabling_retain() returns success.
15641                  */
15642                 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15643                 ASSERT(rval == 0);
15644 
15645                 dtrace_enabling_dump(dtrace_anon.dta_enabling);
15646         }
15647 }
15648 
15649 /*
15650  * DTrace Helper Functions
15651  */
15652 static void
15653 dtrace_helper_trace(dtrace_helper_action_t *helper,
15654     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15655 {
15656         uint32_t size, next, nnext, i;
15657         dtrace_helptrace_t *ent, *buffer;
15658         uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
15659 
15660         if ((buffer = dtrace_helptrace_buffer) == NULL)
15661                 return;
15662 
15663         ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15664 
15665         /*
15666          * What would a tracing framework be without its own tracing
15667          * framework?  (Well, a hell of a lot simpler, for starters...)
15668          */
15669         size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15670             sizeof (uint64_t) - sizeof (uint64_t);
15671 
15672         /*
15673          * Iterate until we can allocate a slot in the trace buffer.
15674          */
15675         do {
15676                 next = dtrace_helptrace_next;
15677 
15678                 if (next + size < dtrace_helptrace_bufsize) {
15679                         nnext = next + size;
15680                 } else {
15681                         nnext = size;
15682                 }
15683         } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15684 
15685         /*
15686          * We have our slot; fill it in.
15687          */
15688         if (nnext == size) {
15689                 dtrace_helptrace_wrapped++;
15690                 next = 0;
15691         }
15692 
15693         ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
15694         ent->dtht_helper = helper;
15695         ent->dtht_where = where;
15696         ent->dtht_nlocals = vstate->dtvs_nlocals;
15697 
15698         ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15699             mstate->dtms_fltoffs : -1;
15700         ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15701         ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
15702 
15703         for (i = 0; i < vstate->dtvs_nlocals; i++) {
15704                 dtrace_statvar_t *svar;
15705 
15706                 if ((svar = vstate->dtvs_locals[i]) == NULL)
15707                         continue;
15708 
15709                 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
15710                 ent->dtht_locals[i] =
15711                     ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
15712         }
15713 }
15714 
15715 static uint64_t
15716 dtrace_helper(int which, dtrace_mstate_t *mstate,
15717     dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15718 {
15719         uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
15720         uint64_t sarg0 = mstate->dtms_arg[0];
15721         uint64_t sarg1 = mstate->dtms_arg[1];
15722         uint64_t rval = 0;
15723         dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15724         dtrace_helper_action_t *helper;
15725         dtrace_vstate_t *vstate;
15726         dtrace_difo_t *pred;
15727         int i, trace = dtrace_helptrace_buffer != NULL;
15728 
15729         ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15730 
15731         if (helpers == NULL)
15732                 return (0);
15733 
15734         if ((helper = helpers->dthps_actions[which]) == NULL)
15735                 return (0);
15736 
15737         vstate = &helpers->dthps_vstate;
15738         mstate->dtms_arg[0] = arg0;
15739         mstate->dtms_arg[1] = arg1;
15740 
15741         /*
15742          * Now iterate over each helper.  If its predicate evaluates to 'true',
15743          * we'll call the corresponding actions.  Note that the below calls
15744          * to dtrace_dif_emulate() may set faults in machine state.  This is
15745          * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
15746          * the stored DIF offset with its own (which is the desired behavior).
15747          * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15748          * from machine state; this is okay, too.
15749          */
15750         for (; helper != NULL; helper = helper->dtha_next) {
15751                 if ((pred = helper->dtha_predicate) != NULL) {
15752                         if (trace)
15753                                 dtrace_helper_trace(helper, mstate, vstate, 0);
15754 
15755                         if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15756                                 goto next;
15757 
15758                         if (*flags & CPU_DTRACE_FAULT)
15759                                 goto err;
15760                 }
15761 
15762                 for (i = 0; i < helper->dtha_nactions; i++) {
15763                         if (trace)
15764                                 dtrace_helper_trace(helper,
15765                                     mstate, vstate, i + 1);
15766 
15767                         rval = dtrace_dif_emulate(helper->dtha_actions[i],
15768                             mstate, vstate, state);
15769 
15770                         if (*flags & CPU_DTRACE_FAULT)
15771                                 goto err;
15772                 }
15773 
15774 next:
15775                 if (trace)
15776                         dtrace_helper_trace(helper, mstate, vstate,
15777                             DTRACE_HELPTRACE_NEXT);
15778         }
15779 
15780         if (trace)
15781                 dtrace_helper_trace(helper, mstate, vstate,
15782                     DTRACE_HELPTRACE_DONE);
15783 
15784         /*
15785          * Restore the arg0 that we saved upon entry.
15786          */
15787         mstate->dtms_arg[0] = sarg0;
15788         mstate->dtms_arg[1] = sarg1;
15789 
15790         return (rval);
15791 
15792 err:
15793         if (trace)
15794                 dtrace_helper_trace(helper, mstate, vstate,
15795                     DTRACE_HELPTRACE_ERR);
15796 
15797         /*
15798          * Restore the arg0 that we saved upon entry.
15799          */
15800         mstate->dtms_arg[0] = sarg0;
15801         mstate->dtms_arg[1] = sarg1;
15802 
15803         return (0);
15804 }
15805 
15806 static void
15807 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15808     dtrace_vstate_t *vstate)
15809 {
15810         int i;
15811 
15812         if (helper->dtha_predicate != NULL)
15813                 dtrace_difo_release(helper->dtha_predicate, vstate);
15814 
15815         for (i = 0; i < helper->dtha_nactions; i++) {
15816                 ASSERT(helper->dtha_actions[i] != NULL);
15817                 dtrace_difo_release(helper->dtha_actions[i], vstate);
15818         }
15819 
15820         kmem_free(helper->dtha_actions,
15821             helper->dtha_nactions * sizeof (dtrace_difo_t *));
15822         kmem_free(helper, sizeof (dtrace_helper_action_t));
15823 }
15824 
15825 static int
15826 dtrace_helper_destroygen(dtrace_helpers_t *help, int gen)
15827 {
15828         proc_t *p = curproc;
15829         dtrace_vstate_t *vstate;
15830         int i;
15831 
15832         if (help == NULL)
15833                 help = p->p_dtrace_helpers;
15834 
15835         ASSERT(MUTEX_HELD(&dtrace_lock));
15836 
15837         if (help == NULL || gen > help->dthps_generation)
15838                 return (EINVAL);
15839 
15840         vstate = &help->dthps_vstate;
15841 
15842         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15843                 dtrace_helper_action_t *last = NULL, *h, *next;
15844 
15845                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15846                         next = h->dtha_next;
15847 
15848                         if (h->dtha_generation == gen) {
15849                                 if (last != NULL) {
15850                                         last->dtha_next = next;
15851                                 } else {
15852                                         help->dthps_actions[i] = next;
15853                                 }
15854 
15855                                 dtrace_helper_action_destroy(h, vstate);
15856                         } else {
15857                                 last = h;
15858                         }
15859                 }
15860         }
15861 
15862         /*
15863          * Interate until we've cleared out all helper providers with the
15864          * given generation number.
15865          */
15866         for (;;) {
15867                 dtrace_helper_provider_t *prov;
15868 
15869                 /*
15870                  * Look for a helper provider with the right generation. We
15871                  * have to start back at the beginning of the list each time
15872                  * because we drop dtrace_lock. It's unlikely that we'll make
15873                  * more than two passes.
15874                  */
15875                 for (i = 0; i < help->dthps_nprovs; i++) {
15876                         prov = help->dthps_provs[i];
15877 
15878                         if (prov->dthp_generation == gen)
15879                                 break;
15880                 }
15881 
15882                 /*
15883                  * If there were no matches, we're done.
15884                  */
15885                 if (i == help->dthps_nprovs)
15886                         break;
15887 
15888                 /*
15889                  * Move the last helper provider into this slot.
15890                  */
15891                 help->dthps_nprovs--;
15892                 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15893                 help->dthps_provs[help->dthps_nprovs] = NULL;
15894 
15895                 mutex_exit(&dtrace_lock);
15896 
15897                 /*
15898                  * If we have a meta provider, remove this helper provider.
15899                  */
15900                 mutex_enter(&dtrace_meta_lock);
15901                 if (dtrace_meta_pid != NULL) {
15902                         ASSERT(dtrace_deferred_pid == NULL);
15903                         dtrace_helper_provider_remove(&prov->dthp_prov,
15904                             p->p_pid);
15905                 }
15906                 mutex_exit(&dtrace_meta_lock);
15907 
15908                 dtrace_helper_provider_destroy(prov);
15909 
15910                 mutex_enter(&dtrace_lock);
15911         }
15912 
15913         return (0);
15914 }
15915 
15916 static int
15917 dtrace_helper_validate(dtrace_helper_action_t *helper)
15918 {
15919         int err = 0, i;
15920         dtrace_difo_t *dp;
15921 
15922         if ((dp = helper->dtha_predicate) != NULL)
15923                 err += dtrace_difo_validate_helper(dp);
15924 
15925         for (i = 0; i < helper->dtha_nactions; i++)
15926                 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15927 
15928         return (err == 0);
15929 }
15930 
15931 static int
15932 dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep,
15933     dtrace_helpers_t *help)
15934 {
15935         dtrace_helper_action_t *helper, *last;
15936         dtrace_actdesc_t *act;
15937         dtrace_vstate_t *vstate;
15938         dtrace_predicate_t *pred;
15939         int count = 0, nactions = 0, i;
15940 
15941         if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15942                 return (EINVAL);
15943 
15944         last = help->dthps_actions[which];
15945         vstate = &help->dthps_vstate;
15946 
15947         for (count = 0; last != NULL; last = last->dtha_next) {
15948                 count++;
15949                 if (last->dtha_next == NULL)
15950                         break;
15951         }
15952 
15953         /*
15954          * If we already have dtrace_helper_actions_max helper actions for this
15955          * helper action type, we'll refuse to add a new one.
15956          */
15957         if (count >= dtrace_helper_actions_max)
15958                 return (ENOSPC);
15959 
15960         helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15961         helper->dtha_generation = help->dthps_generation;
15962 
15963         if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15964                 ASSERT(pred->dtp_difo != NULL);
15965                 dtrace_difo_hold(pred->dtp_difo);
15966                 helper->dtha_predicate = pred->dtp_difo;
15967         }
15968 
15969         for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15970                 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15971                         goto err;
15972 
15973                 if (act->dtad_difo == NULL)
15974                         goto err;
15975 
15976                 nactions++;
15977         }
15978 
15979         helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15980             (helper->dtha_nactions = nactions), KM_SLEEP);
15981 
15982         for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15983                 dtrace_difo_hold(act->dtad_difo);
15984                 helper->dtha_actions[i++] = act->dtad_difo;
15985         }
15986 
15987         if (!dtrace_helper_validate(helper))
15988                 goto err;
15989 
15990         if (last == NULL) {
15991                 help->dthps_actions[which] = helper;
15992         } else {
15993                 last->dtha_next = helper;
15994         }
15995 
15996         if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15997                 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15998                 dtrace_helptrace_next = 0;
15999         }
16000 
16001         return (0);
16002 err:
16003         dtrace_helper_action_destroy(helper, vstate);
16004         return (EINVAL);
16005 }
16006 
16007 static void
16008 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
16009     dof_helper_t *dofhp)
16010 {
16011         ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
16012 
16013         mutex_enter(&dtrace_meta_lock);
16014         mutex_enter(&dtrace_lock);
16015 
16016         if (!dtrace_attached() || dtrace_meta_pid == NULL) {
16017                 /*
16018                  * If the dtrace module is loaded but not attached, or if
16019                  * there aren't isn't a meta provider registered to deal with
16020                  * these provider descriptions, we need to postpone creating
16021                  * the actual providers until later.
16022                  */
16023 
16024                 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
16025                     dtrace_deferred_pid != help) {
16026                         help->dthps_deferred = 1;
16027                         help->dthps_pid = p->p_pid;
16028                         help->dthps_next = dtrace_deferred_pid;
16029                         help->dthps_prev = NULL;
16030                         if (dtrace_deferred_pid != NULL)
16031                                 dtrace_deferred_pid->dthps_prev = help;
16032                         dtrace_deferred_pid = help;
16033                 }
16034 
16035                 mutex_exit(&dtrace_lock);
16036 
16037         } else if (dofhp != NULL) {
16038                 /*
16039                  * If the dtrace module is loaded and we have a particular
16040                  * helper provider description, pass that off to the
16041                  * meta provider.
16042                  */
16043 
16044                 mutex_exit(&dtrace_lock);
16045 
16046                 dtrace_helper_provide(dofhp, p->p_pid);
16047 
16048         } else {
16049                 /*
16050                  * Otherwise, just pass all the helper provider descriptions
16051                  * off to the meta provider.
16052                  */
16053 
16054                 int i;
16055                 mutex_exit(&dtrace_lock);
16056 
16057                 for (i = 0; i < help->dthps_nprovs; i++) {
16058                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
16059                             p->p_pid);
16060                 }
16061         }
16062 
16063         mutex_exit(&dtrace_meta_lock);
16064 }
16065 
16066 static int
16067 dtrace_helper_provider_add(dof_helper_t *dofhp, dtrace_helpers_t *help, int gen)
16068 {
16069         dtrace_helper_provider_t *hprov, **tmp_provs;
16070         uint_t tmp_maxprovs, i;
16071 
16072         ASSERT(MUTEX_HELD(&dtrace_lock));
16073         ASSERT(help != NULL);
16074 
16075         /*
16076          * If we already have dtrace_helper_providers_max helper providers,
16077          * we're refuse to add a new one.
16078          */
16079         if (help->dthps_nprovs >= dtrace_helper_providers_max)
16080                 return (ENOSPC);
16081 
16082         /*
16083          * Check to make sure this isn't a duplicate.
16084          */
16085         for (i = 0; i < help->dthps_nprovs; i++) {
16086                 if (dofhp->dofhp_addr ==
16087                     help->dthps_provs[i]->dthp_prov.dofhp_addr)
16088                         return (EALREADY);
16089         }
16090 
16091         hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
16092         hprov->dthp_prov = *dofhp;
16093         hprov->dthp_ref = 1;
16094         hprov->dthp_generation = gen;
16095 
16096         /*
16097          * Allocate a bigger table for helper providers if it's already full.
16098          */
16099         if (help->dthps_maxprovs == help->dthps_nprovs) {
16100                 tmp_maxprovs = help->dthps_maxprovs;
16101                 tmp_provs = help->dthps_provs;
16102 
16103                 if (help->dthps_maxprovs == 0)
16104                         help->dthps_maxprovs = 2;
16105                 else
16106                         help->dthps_maxprovs *= 2;
16107                 if (help->dthps_maxprovs > dtrace_helper_providers_max)
16108                         help->dthps_maxprovs = dtrace_helper_providers_max;
16109 
16110                 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
16111 
16112                 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
16113                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16114 
16115                 if (tmp_provs != NULL) {
16116                         bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
16117                             sizeof (dtrace_helper_provider_t *));
16118                         kmem_free(tmp_provs, tmp_maxprovs *
16119                             sizeof (dtrace_helper_provider_t *));
16120                 }
16121         }
16122 
16123         help->dthps_provs[help->dthps_nprovs] = hprov;
16124         help->dthps_nprovs++;
16125 
16126         return (0);
16127 }
16128 
16129 static void
16130 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
16131 {
16132         mutex_enter(&dtrace_lock);
16133 
16134         if (--hprov->dthp_ref == 0) {
16135                 dof_hdr_t *dof;
16136                 mutex_exit(&dtrace_lock);
16137                 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
16138                 dtrace_dof_destroy(dof);
16139                 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
16140         } else {
16141                 mutex_exit(&dtrace_lock);
16142         }
16143 }
16144 
16145 static int
16146 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
16147 {
16148         uintptr_t daddr = (uintptr_t)dof;
16149         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
16150         dof_provider_t *provider;
16151         dof_probe_t *probe;
16152         uint8_t *arg;
16153         char *strtab, *typestr;
16154         dof_stridx_t typeidx;
16155         size_t typesz;
16156         uint_t nprobes, j, k;
16157 
16158         ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
16159 
16160         if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
16161                 dtrace_dof_error(dof, "misaligned section offset");
16162                 return (-1);
16163         }
16164 
16165         /*
16166          * The section needs to be large enough to contain the DOF provider
16167          * structure appropriate for the given version.
16168          */
16169         if (sec->dofs_size <
16170             ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
16171             offsetof(dof_provider_t, dofpv_prenoffs) :
16172             sizeof (dof_provider_t))) {
16173                 dtrace_dof_error(dof, "provider section too small");
16174                 return (-1);
16175         }
16176 
16177         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
16178         str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
16179         prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
16180         arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
16181         off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
16182 
16183         if (str_sec == NULL || prb_sec == NULL ||
16184             arg_sec == NULL || off_sec == NULL)
16185                 return (-1);
16186 
16187         enoff_sec = NULL;
16188 
16189         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
16190             provider->dofpv_prenoffs != DOF_SECT_NONE &&
16191             (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
16192             provider->dofpv_prenoffs)) == NULL)
16193                 return (-1);
16194 
16195         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
16196 
16197         if (provider->dofpv_name >= str_sec->dofs_size ||
16198             strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
16199                 dtrace_dof_error(dof, "invalid provider name");
16200                 return (-1);
16201         }
16202 
16203         if (prb_sec->dofs_entsize == 0 ||
16204             prb_sec->dofs_entsize > prb_sec->dofs_size) {
16205                 dtrace_dof_error(dof, "invalid entry size");
16206                 return (-1);
16207         }
16208 
16209         if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
16210                 dtrace_dof_error(dof, "misaligned entry size");
16211                 return (-1);
16212         }
16213 
16214         if (off_sec->dofs_entsize != sizeof (uint32_t)) {
16215                 dtrace_dof_error(dof, "invalid entry size");
16216                 return (-1);
16217         }
16218 
16219         if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
16220                 dtrace_dof_error(dof, "misaligned section offset");
16221                 return (-1);
16222         }
16223 
16224         if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
16225                 dtrace_dof_error(dof, "invalid entry size");
16226                 return (-1);
16227         }
16228 
16229         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
16230 
16231         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
16232 
16233         /*
16234          * Take a pass through the probes to check for errors.
16235          */
16236         for (j = 0; j < nprobes; j++) {
16237                 probe = (dof_probe_t *)(uintptr_t)(daddr +
16238                     prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
16239 
16240                 if (probe->dofpr_func >= str_sec->dofs_size) {
16241                         dtrace_dof_error(dof, "invalid function name");
16242                         return (-1);
16243                 }
16244 
16245                 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
16246                         dtrace_dof_error(dof, "function name too long");
16247                         /*
16248                          * Keep going if the function name is too long.
16249                          * Unlike provider and probe names, we cannot reasonably
16250                          * impose restrictions on function names, since they're
16251                          * a property of the code being instrumented. We will
16252                          * skip this probe in dtrace_helper_provide_one().
16253                          */
16254                 }
16255 
16256                 if (probe->dofpr_name >= str_sec->dofs_size ||
16257                     strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
16258                         dtrace_dof_error(dof, "invalid probe name");
16259                         return (-1);
16260                 }
16261 
16262                 /*
16263                  * The offset count must not wrap the index, and the offsets
16264                  * must also not overflow the section's data.
16265                  */
16266                 if (probe->dofpr_offidx + probe->dofpr_noffs <
16267                     probe->dofpr_offidx ||
16268                     (probe->dofpr_offidx + probe->dofpr_noffs) *
16269                     off_sec->dofs_entsize > off_sec->dofs_size) {
16270                         dtrace_dof_error(dof, "invalid probe offset");
16271                         return (-1);
16272                 }
16273 
16274                 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
16275                         /*
16276                          * If there's no is-enabled offset section, make sure
16277                          * there aren't any is-enabled offsets. Otherwise
16278                          * perform the same checks as for probe offsets
16279                          * (immediately above).
16280                          */
16281                         if (enoff_sec == NULL) {
16282                                 if (probe->dofpr_enoffidx != 0 ||
16283                                     probe->dofpr_nenoffs != 0) {
16284                                         dtrace_dof_error(dof, "is-enabled "
16285                                             "offsets with null section");
16286                                         return (-1);
16287                                 }
16288                         } else if (probe->dofpr_enoffidx +
16289                             probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
16290                             (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
16291                             enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
16292                                 dtrace_dof_error(dof, "invalid is-enabled "
16293                                     "offset");
16294                                 return (-1);
16295                         }
16296 
16297                         if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
16298                                 dtrace_dof_error(dof, "zero probe and "
16299                                     "is-enabled offsets");
16300                                 return (-1);
16301                         }
16302                 } else if (probe->dofpr_noffs == 0) {
16303                         dtrace_dof_error(dof, "zero probe offsets");
16304                         return (-1);
16305                 }
16306 
16307                 if (probe->dofpr_argidx + probe->dofpr_xargc <
16308                     probe->dofpr_argidx ||
16309                     (probe->dofpr_argidx + probe->dofpr_xargc) *
16310                     arg_sec->dofs_entsize > arg_sec->dofs_size) {
16311                         dtrace_dof_error(dof, "invalid args");
16312                         return (-1);
16313                 }
16314 
16315                 typeidx = probe->dofpr_nargv;
16316                 typestr = strtab + probe->dofpr_nargv;
16317                 for (k = 0; k < probe->dofpr_nargc; k++) {
16318                         if (typeidx >= str_sec->dofs_size) {
16319                                 dtrace_dof_error(dof, "bad "
16320                                     "native argument type");
16321                                 return (-1);
16322                         }
16323 
16324                         typesz = strlen(typestr) + 1;
16325                         if (typesz > DTRACE_ARGTYPELEN) {
16326                                 dtrace_dof_error(dof, "native "
16327                                     "argument type too long");
16328                                 return (-1);
16329                         }
16330                         typeidx += typesz;
16331                         typestr += typesz;
16332                 }
16333 
16334                 typeidx = probe->dofpr_xargv;
16335                 typestr = strtab + probe->dofpr_xargv;
16336                 for (k = 0; k < probe->dofpr_xargc; k++) {
16337                         if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
16338                                 dtrace_dof_error(dof, "bad "
16339                                     "native argument index");
16340                                 return (-1);
16341                         }
16342 
16343                         if (typeidx >= str_sec->dofs_size) {
16344                                 dtrace_dof_error(dof, "bad "
16345                                     "translated argument type");
16346                                 return (-1);
16347                         }
16348 
16349                         typesz = strlen(typestr) + 1;
16350                         if (typesz > DTRACE_ARGTYPELEN) {
16351                                 dtrace_dof_error(dof, "translated argument "
16352                                     "type too long");
16353                                 return (-1);
16354                         }
16355 
16356                         typeidx += typesz;
16357                         typestr += typesz;
16358                 }
16359         }
16360 
16361         return (0);
16362 }
16363 
16364 static int
16365 dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp, struct proc *p)
16366 {
16367         dtrace_helpers_t *help;
16368         dtrace_vstate_t *vstate;
16369         dtrace_enabling_t *enab = NULL;
16370         int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
16371         uintptr_t daddr = (uintptr_t)dof;
16372 
16373         ASSERT(MUTEX_HELD(&dtrace_lock));
16374 
16375         if ((help = p->p_dtrace_helpers) == NULL)
16376                 help = dtrace_helpers_create(p);
16377 
16378         vstate = &help->dthps_vstate;
16379 
16380         if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab, dhp->dofhp_addr,
16381             dhp->dofhp_dof, B_FALSE)) != 0) {
16382                 dtrace_dof_destroy(dof);
16383                 return (rv);
16384         }
16385 
16386         /*
16387          * Look for helper providers and validate their descriptions.
16388          */
16389         for (i = 0; i < dof->dofh_secnum; i++) {
16390                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
16391                     dof->dofh_secoff + i * dof->dofh_secsize);
16392 
16393                 if (sec->dofs_type != DOF_SECT_PROVIDER)
16394                         continue;
16395 
16396                 if (dtrace_helper_provider_validate(dof, sec) != 0) {
16397                         dtrace_enabling_destroy(enab);
16398                         dtrace_dof_destroy(dof);
16399                         return (-1);
16400                 }
16401 
16402                 nprovs++;
16403         }
16404 
16405         /*
16406          * Now we need to walk through the ECB descriptions in the enabling.
16407          */
16408         for (i = 0; i < enab->dten_ndesc; i++) {
16409                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
16410                 dtrace_probedesc_t *desc = &ep->dted_probe;
16411 
16412                 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
16413                         continue;
16414 
16415                 if (strcmp(desc->dtpd_mod, "helper") != 0)
16416                         continue;
16417 
16418                 if (strcmp(desc->dtpd_func, "ustack") != 0)
16419                         continue;
16420 
16421                 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
16422                     ep, help)) != 0) {
16423                         /*
16424                          * Adding this helper action failed -- we are now going
16425                          * to rip out the entire generation and return failure.
16426                          */
16427                         (void) dtrace_helper_destroygen(help,
16428                             help->dthps_generation);
16429                         dtrace_enabling_destroy(enab);
16430                         dtrace_dof_destroy(dof);
16431                         return (-1);
16432                 }
16433 
16434                 nhelpers++;
16435         }
16436 
16437         if (nhelpers < enab->dten_ndesc)
16438                 dtrace_dof_error(dof, "unmatched helpers");
16439 
16440         gen = help->dthps_generation++;
16441         dtrace_enabling_destroy(enab);
16442 
16443         if (nprovs > 0) {
16444                 /*
16445                  * Now that this is in-kernel, we change the sense of the
16446                  * members:  dofhp_dof denotes the in-kernel copy of the DOF
16447                  * and dofhp_addr denotes the address at user-level.
16448                  */
16449                 dhp->dofhp_addr = dhp->dofhp_dof;
16450                 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
16451 
16452                 if (dtrace_helper_provider_add(dhp, help, gen) == 0) {
16453                         mutex_exit(&dtrace_lock);
16454                         dtrace_helper_provider_register(p, help, dhp);
16455                         mutex_enter(&dtrace_lock);
16456 
16457                         destroy = 0;
16458                 }
16459         }
16460 
16461         if (destroy)
16462                 dtrace_dof_destroy(dof);
16463 
16464         return (gen);
16465 }
16466 
16467 static dtrace_helpers_t *
16468 dtrace_helpers_create(proc_t *p)
16469 {
16470         dtrace_helpers_t *help;
16471 
16472         ASSERT(MUTEX_HELD(&dtrace_lock));
16473         ASSERT(p->p_dtrace_helpers == NULL);
16474 
16475         help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16476         help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16477             DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16478 
16479         p->p_dtrace_helpers = help;
16480         dtrace_helpers++;
16481 
16482         return (help);
16483 }
16484 
16485 #ifdef illumos
16486 static
16487 #endif
16488 void
16489 dtrace_helpers_destroy(proc_t *p)
16490 {
16491         dtrace_helpers_t *help;
16492         dtrace_vstate_t *vstate;
16493 #ifdef illumos
16494         proc_t *p = curproc;
16495 #endif
16496         int i;
16497 
16498         mutex_enter(&dtrace_lock);
16499 
16500         ASSERT(p->p_dtrace_helpers != NULL);
16501         ASSERT(dtrace_helpers > 0);
16502 
16503         help = p->p_dtrace_helpers;
16504         vstate = &help->dthps_vstate;
16505 
16506         /*
16507          * We're now going to lose the help from this process.
16508          */
16509         p->p_dtrace_helpers = NULL;
16510         dtrace_sync();
16511 
16512         /*
16513          * Destory the helper actions.
16514          */
16515         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16516                 dtrace_helper_action_t *h, *next;
16517 
16518                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16519                         next = h->dtha_next;
16520                         dtrace_helper_action_destroy(h, vstate);
16521                         h = next;
16522                 }
16523         }
16524 
16525         mutex_exit(&dtrace_lock);
16526 
16527         /*
16528          * Destroy the helper providers.
16529          */
16530         if (help->dthps_maxprovs > 0) {
16531                 mutex_enter(&dtrace_meta_lock);
16532                 if (dtrace_meta_pid != NULL) {
16533                         ASSERT(dtrace_deferred_pid == NULL);
16534 
16535                         for (i = 0; i < help->dthps_nprovs; i++) {
16536                                 dtrace_helper_provider_remove(
16537                                     &help->dthps_provs[i]->dthp_prov, p->p_pid);
16538                         }
16539                 } else {
16540                         mutex_enter(&dtrace_lock);
16541                         ASSERT(help->dthps_deferred == 0 ||
16542                             help->dthps_next != NULL ||
16543                             help->dthps_prev != NULL ||
16544                             help == dtrace_deferred_pid);
16545 
16546                         /*
16547                          * Remove the helper from the deferred list.
16548                          */
16549                         if (help->dthps_next != NULL)
16550                                 help->dthps_next->dthps_prev = help->dthps_prev;
16551                         if (help->dthps_prev != NULL)
16552                                 help->dthps_prev->dthps_next = help->dthps_next;
16553                         if (dtrace_deferred_pid == help) {
16554                                 dtrace_deferred_pid = help->dthps_next;
16555                                 ASSERT(help->dthps_prev == NULL);
16556                         }
16557 
16558                         mutex_exit(&dtrace_lock);
16559                 }
16560 
16561                 mutex_exit(&dtrace_meta_lock);
16562 
16563                 for (i = 0; i < help->dthps_nprovs; i++) {
16564                         dtrace_helper_provider_destroy(help->dthps_provs[i]);
16565                 }
16566 
16567                 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16568                     sizeof (dtrace_helper_provider_t *));
16569         }
16570 
16571         mutex_enter(&dtrace_lock);
16572 
16573         dtrace_vstate_fini(&help->dthps_vstate);
16574         kmem_free(help->dthps_actions,
16575             sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16576         kmem_free(help, sizeof (dtrace_helpers_t));
16577 
16578         --dtrace_helpers;
16579         mutex_exit(&dtrace_lock);
16580 }
16581 
16582 #ifdef illumos
16583 static
16584 #endif
16585 void
16586 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16587 {
16588         dtrace_helpers_t *help, *newhelp;
16589         dtrace_helper_action_t *helper, *new, *last;
16590         dtrace_difo_t *dp;
16591         dtrace_vstate_t *vstate;
16592         int i, j, sz, hasprovs = 0;
16593 
16594         mutex_enter(&dtrace_lock);
16595         ASSERT(from->p_dtrace_helpers != NULL);
16596         ASSERT(dtrace_helpers > 0);
16597 
16598         help = from->p_dtrace_helpers;
16599         newhelp = dtrace_helpers_create(to);
16600         ASSERT(to->p_dtrace_helpers != NULL);
16601 
16602         newhelp->dthps_generation = help->dthps_generation;
16603         vstate = &newhelp->dthps_vstate;
16604 
16605         /*
16606          * Duplicate the helper actions.
16607          */
16608         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16609                 if ((helper = help->dthps_actions[i]) == NULL)
16610                         continue;
16611 
16612                 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16613                         new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16614                             KM_SLEEP);
16615                         new->dtha_generation = helper->dtha_generation;
16616 
16617                         if ((dp = helper->dtha_predicate) != NULL) {
16618                                 dp = dtrace_difo_duplicate(dp, vstate);
16619                                 new->dtha_predicate = dp;
16620                         }
16621 
16622                         new->dtha_nactions = helper->dtha_nactions;
16623                         sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16624                         new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16625 
16626                         for (j = 0; j < new->dtha_nactions; j++) {
16627                                 dtrace_difo_t *dp = helper->dtha_actions[j];
16628 
16629                                 ASSERT(dp != NULL);
16630                                 dp = dtrace_difo_duplicate(dp, vstate);
16631                                 new->dtha_actions[j] = dp;
16632                         }
16633 
16634                         if (last != NULL) {
16635                                 last->dtha_next = new;
16636                         } else {
16637                                 newhelp->dthps_actions[i] = new;
16638                         }
16639 
16640                         last = new;
16641                 }
16642         }
16643 
16644         /*
16645          * Duplicate the helper providers and register them with the
16646          * DTrace framework.
16647          */
16648         if (help->dthps_nprovs > 0) {
16649                 newhelp->dthps_nprovs = help->dthps_nprovs;
16650                 newhelp->dthps_maxprovs = help->dthps_nprovs;
16651                 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16652                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16653                 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16654                         newhelp->dthps_provs[i] = help->dthps_provs[i];
16655                         newhelp->dthps_provs[i]->dthp_ref++;
16656                 }
16657 
16658                 hasprovs = 1;
16659         }
16660 
16661         mutex_exit(&dtrace_lock);
16662 
16663         if (hasprovs)
16664                 dtrace_helper_provider_register(to, newhelp, NULL);
16665 }
16666 
16667 /*
16668  * DTrace Hook Functions
16669  */
16670 static void
16671 dtrace_module_loaded(modctl_t *ctl)
16672 {
16673         dtrace_provider_t *prv;
16674 
16675         mutex_enter(&dtrace_provider_lock);
16676 #ifdef illumos
16677         mutex_enter(&mod_lock);
16678 #endif
16679 
16680 #ifdef illumos
16681         ASSERT(ctl->mod_busy);
16682 #endif
16683 
16684         /*
16685          * We're going to call each providers per-module provide operation
16686          * specifying only this module.
16687          */
16688         for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16689                 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16690 
16691 #ifdef illumos
16692         mutex_exit(&mod_lock);
16693 #endif
16694         mutex_exit(&dtrace_provider_lock);
16695 
16696         /*
16697          * If we have any retained enablings, we need to match against them.
16698          * Enabling probes requires that cpu_lock be held, and we cannot hold
16699          * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16700          * module.  (In particular, this happens when loading scheduling
16701          * classes.)  So if we have any retained enablings, we need to dispatch
16702          * our task queue to do the match for us.
16703          */
16704         mutex_enter(&dtrace_lock);
16705 
16706         if (dtrace_retained == NULL) {
16707                 mutex_exit(&dtrace_lock);
16708                 return;
16709         }
16710 
16711         (void) taskq_dispatch(dtrace_taskq,
16712             (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
16713 
16714         mutex_exit(&dtrace_lock);
16715 
16716         /*
16717          * And now, for a little heuristic sleaze:  in general, we want to
16718          * match modules as soon as they load.  However, we cannot guarantee
16719          * this, because it would lead us to the lock ordering violation
16720          * outlined above.  The common case, of course, is that cpu_lock is
16721          * _not_ held -- so we delay here for a clock tick, hoping that that's
16722          * long enough for the task queue to do its work.  If it's not, it's
16723          * not a serious problem -- it just means that the module that we
16724          * just loaded may not be immediately instrumentable.
16725          */
16726         delay(1);
16727 }
16728 
16729 static void
16730 #ifdef illumos
16731 dtrace_module_unloaded(modctl_t *ctl)
16732 #else
16733 dtrace_module_unloaded(modctl_t *ctl, int *error)
16734 #endif
16735 {
16736         dtrace_probe_t template, *probe, *first, *next;
16737         dtrace_provider_t *prov;
16738 #ifndef illumos
16739         char modname[DTRACE_MODNAMELEN];
16740         size_t len;
16741 #endif
16742 
16743 #ifdef illumos
16744         template.dtpr_mod = ctl->mod_modname;
16745 #else
16746         /* Handle the fact that ctl->filename may end in ".ko". */
16747         strlcpy(modname, ctl->filename, sizeof(modname));
16748         len = strlen(ctl->filename);
16749         if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
16750                 modname[len - 3] = '\0';
16751         template.dtpr_mod = modname;
16752 #endif
16753 
16754         mutex_enter(&dtrace_provider_lock);
16755 #ifdef illumos
16756         mutex_enter(&mod_lock);
16757 #endif
16758         mutex_enter(&dtrace_lock);
16759 
16760 #ifndef illumos
16761         if (ctl->nenabled > 0) {
16762                 /* Don't allow unloads if a probe is enabled. */
16763                 mutex_exit(&dtrace_provider_lock);
16764                 mutex_exit(&dtrace_lock);
16765                 *error = -1;
16766                 printf(
16767         "kldunload: attempt to unload module that has DTrace probes enabled\n");
16768                 return;
16769         }
16770 #endif
16771 
16772         if (dtrace_bymod == NULL) {
16773                 /*
16774                  * The DTrace module is loaded (obviously) but not attached;
16775                  * we don't have any work to do.
16776                  */
16777                 mutex_exit(&dtrace_provider_lock);
16778 #ifdef illumos
16779                 mutex_exit(&mod_lock);
16780 #endif
16781                 mutex_exit(&dtrace_lock);
16782                 return;
16783         }
16784 
16785         for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16786             probe != NULL; probe = probe->dtpr_nextmod) {
16787                 if (probe->dtpr_ecb != NULL) {
16788                         mutex_exit(&dtrace_provider_lock);
16789 #ifdef illumos
16790                         mutex_exit(&mod_lock);
16791 #endif
16792                         mutex_exit(&dtrace_lock);
16793 
16794                         /*
16795                          * This shouldn't _actually_ be possible -- we're
16796                          * unloading a module that has an enabled probe in it.
16797                          * (It's normally up to the provider to make sure that
16798                          * this can't happen.)  However, because dtps_enable()
16799                          * doesn't have a failure mode, there can be an
16800                          * enable/unload race.  Upshot:  we don't want to
16801                          * assert, but we're not going to disable the
16802                          * probe, either.
16803                          */
16804                         if (dtrace_err_verbose) {
16805 #ifdef illumos
16806                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16807                                     "enabled probes", ctl->mod_modname);
16808 #else
16809                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16810                                     "enabled probes", modname);
16811 #endif
16812                         }
16813 
16814                         return;
16815                 }
16816         }
16817 
16818         probe = first;
16819 
16820         for (first = NULL; probe != NULL; probe = next) {
16821                 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16822 
16823                 dtrace_probes[probe->dtpr_id - 1] = NULL;
16824 
16825                 next = probe->dtpr_nextmod;
16826                 dtrace_hash_remove(dtrace_bymod, probe);
16827                 dtrace_hash_remove(dtrace_byfunc, probe);
16828                 dtrace_hash_remove(dtrace_byname, probe);
16829 
16830                 if (first == NULL) {
16831                         first = probe;
16832                         probe->dtpr_nextmod = NULL;
16833                 } else {
16834                         probe->dtpr_nextmod = first;
16835                         first = probe;
16836                 }
16837         }
16838 
16839         /*
16840          * We've removed all of the module's probes from the hash chains and
16841          * from the probe array.  Now issue a dtrace_sync() to be sure that
16842          * everyone has cleared out from any probe array processing.
16843          */
16844         dtrace_sync();
16845 
16846         for (probe = first; probe != NULL; probe = first) {
16847                 first = probe->dtpr_nextmod;
16848                 prov = probe->dtpr_provider;
16849                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16850                     probe->dtpr_arg);
16851                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16852                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16853                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16854 #ifdef illumos
16855                 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16856 #else
16857                 free_unr(dtrace_arena, probe->dtpr_id);
16858 #endif
16859                 kmem_free(probe, sizeof (dtrace_probe_t));
16860         }
16861 
16862         mutex_exit(&dtrace_lock);
16863 #ifdef illumos
16864         mutex_exit(&mod_lock);
16865 #endif
16866         mutex_exit(&dtrace_provider_lock);
16867 }
16868 
16869 #ifndef illumos
16870 static void
16871 dtrace_kld_load(void *arg __unused, linker_file_t lf)
16872 {
16873 
16874         dtrace_module_loaded(lf);
16875 }
16876 
16877 static void
16878 dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
16879 {
16880 
16881         if (*error != 0)
16882                 /* We already have an error, so don't do anything. */
16883                 return;
16884         dtrace_module_unloaded(lf, error);
16885 }
16886 #endif
16887 
#ifdef illumos
/*
 * Apply the dtps_suspend provider operation via dtrace_probe_foreach().
 */
static void
dtrace_suspend(void)
{
        const size_t suspend_op = offsetof(dtrace_pops_t, dtps_suspend);

        dtrace_probe_foreach(suspend_op);
}

/*
 * Apply the dtps_resume provider operation via dtrace_probe_foreach().
 */
static void
dtrace_resume(void)
{
        const size_t resume_op = offsetof(dtrace_pops_t, dtps_resume);

        dtrace_probe_foreach(resume_op);
}
#endif
16901 
16902 static int
16903 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16904 {
16905         ASSERT(MUTEX_HELD(&cpu_lock));
16906         mutex_enter(&dtrace_lock);
16907 
16908         switch (what) {
16909         case CPU_CONFIG: {
16910                 dtrace_state_t *state;
16911                 dtrace_optval_t *opt, rs, c;
16912 
16913                 /*
16914                  * For now, we only allocate a new buffer for anonymous state.
16915                  */
16916                 if ((state = dtrace_anon.dta_state) == NULL)
16917                         break;
16918 
16919                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16920                         break;
16921 
16922                 opt = state->dts_options;
16923                 c = opt[DTRACEOPT_CPU];
16924 
16925                 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16926                         break;
16927 
16928                 /*
16929                  * Regardless of what the actual policy is, we're going to
16930                  * temporarily set our resize policy to be manual.  We're
16931                  * also going to temporarily set our CPU option to denote
16932                  * the newly configured CPU.
16933                  */
16934                 rs = opt[DTRACEOPT_BUFRESIZE];
16935                 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16936                 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16937 
16938                 (void) dtrace_state_buffers(state);
16939 
16940                 opt[DTRACEOPT_BUFRESIZE] = rs;
16941                 opt[DTRACEOPT_CPU] = c;
16942 
16943                 break;
16944         }
16945 
16946         case CPU_UNCONFIG:
16947                 /*
16948                  * We don't free the buffer in the CPU_UNCONFIG case.  (The
16949                  * buffer will be freed when the consumer exits.)
16950                  */
16951                 break;
16952 
16953         default:
16954                 break;
16955         }
16956 
16957         mutex_exit(&dtrace_lock);
16958         return (0);
16959 }
16960 
#ifdef illumos
/*
 * Run the CPU_CONFIG handling of dtrace_cpu_setup() for the given CPU,
 * discarding its return value.
 */
static void
dtrace_cpu_setup_initial(processorid_t cpu)
{

        (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
}
#endif
16968 
16969 static void
16970 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16971 {
16972         if (dtrace_toxranges >= dtrace_toxranges_max) {
16973                 int osize, nsize;
16974                 dtrace_toxrange_t *range;
16975 
16976                 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16977 
16978                 if (osize == 0) {
16979                         ASSERT(dtrace_toxrange == NULL);
16980                         ASSERT(dtrace_toxranges_max == 0);
16981                         dtrace_toxranges_max = 1;
16982                 } else {
16983                         dtrace_toxranges_max <<= 1;
16984                 }
16985 
16986                 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16987                 range = kmem_zalloc(nsize, KM_SLEEP);
16988 
16989                 if (dtrace_toxrange != NULL) {
16990                         ASSERT(osize != 0);
16991                         bcopy(dtrace_toxrange, range, osize);
16992                         kmem_free(dtrace_toxrange, osize);
16993                 }
16994 
16995                 dtrace_toxrange = range;
16996         }
16997 
16998         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16999         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
17000 
17001         dtrace_toxrange[dtrace_toxranges].dtt_base = base;
17002         dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
17003         dtrace_toxranges++;
17004 }
17005 
/*
 * getf() barrier.  The body exists only on illumos; elsewhere this function
 * does nothing.
 *
 * On illumos:  when there are unprivileged (that is, non-DTRACE_CRV_KERNEL)
 * enablings containing calls to getf(), this routine is called on every
 * closef() before the underlying vnode is released or the file_t is freed.
 * By this point the file_t must no longer be reachable through getf() in
 * probe context, which is what allows a dtrace_sync() to clear out any
 * enablings that still refer to the old structures.
 */
static void
dtrace_getf_barrier(void)
{
#ifdef illumos
        if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
            kcred->cr_zone->zone_dtrace_getf != 0) {
                dtrace_sync();
        }
#endif
}
17024 
17025 /*
17026  * DTrace Driver Cookbook Functions
17027  */
#ifdef illumos
/*
 * Attach entry point for the dtrace pseudo-device (illumos only).
 *
 * Sets up soft state and the "dtrace" and "helper" minor nodes, installs
 * the framework hook functions, creates the identifier arenas, task queue,
 * state cache and probe hashes, discovers the toxic ranges, registers the
 * "dtrace" pseudo-provider with its BEGIN, END and ERROR probes, and
 * processes any anonymous enabling.
 *
 * Returns DDI_SUCCESS, or DDI_FAILURE if the soft state or the minor nodes
 * could not be created.
 */
/*ARGSUSED*/
static int
dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
        dtrace_provider_id_t provid;
        dtrace_state_t *anon_state = NULL;
        dtrace_enabling_t *anon_enab;

        mutex_enter(&cpu_lock);
        mutex_enter(&dtrace_provider_lock);
        mutex_enter(&dtrace_lock);

        if (ddi_soft_state_init(&dtrace_softstate,
            sizeof (dtrace_state_t), 0) != 0) {
                cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
                goto fail;
        }

        if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
            DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
            ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
            DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
                cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
                ddi_remove_minor_node(devi, NULL);
                ddi_soft_state_fini(&dtrace_softstate);
                goto fail;
        }

        ddi_report_dev(devi);
        dtrace_devi = devi;

        /*
         * Install the hook functions through which the rest of the kernel
         * calls into DTrace.
         */
        dtrace_modload = dtrace_module_loaded;
        dtrace_modunload = dtrace_module_unloaded;
        dtrace_cpu_init = dtrace_cpu_setup_initial;
        dtrace_helpers_cleanup = dtrace_helpers_destroy;
        dtrace_helpers_fork = dtrace_helpers_duplicate;
        dtrace_cpustart_init = dtrace_suspend;
        dtrace_cpustart_fini = dtrace_resume;
        dtrace_debugger_init = dtrace_suspend;
        dtrace_debugger_fini = dtrace_resume;

        register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);

        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * Create the identifier arenas, the task queue and the state cache.
         */
        dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
            NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
        dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
            UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
            VM_SLEEP | VMC_IDENTIFIER);
        dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
            1, INT_MAX, 0);

        dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
            sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
            NULL, NULL, NULL, NULL, NULL, 0);

        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * Create the probe hashes, keyed by module, function and name.
         */
        dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
            offsetof(dtrace_probe_t, dtpr_nextmod),
            offsetof(dtrace_probe_t, dtpr_prevmod));

        dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
            offsetof(dtrace_probe_t, dtpr_nextfunc),
            offsetof(dtrace_probe_t, dtpr_prevfunc));

        dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
            offsetof(dtrace_probe_t, dtpr_nextname),
            offsetof(dtrace_probe_t, dtpr_prevname));

        if (dtrace_retain_max < 1) {
                cmn_err(CE_WARN, "illegal value (%zu) for dtrace_retain_max; "
                    "setting to 1", dtrace_retain_max);
                dtrace_retain_max = 1;
        }

        /*
         * Discover the toxic ranges.
         */
        dtrace_toxic_ranges(dtrace_toxrange_add);

        /*
         * We would like to assert that dtrace_provider is NULL before
         * registering ourselves as a provider to our own framework, but that
         * does not hold when we were loaded as a dependency of a DTrace
         * provider.  After registration we can at least assert that
         * dtrace_provider is our pseudo provider.
         */
        (void) dtrace_register("dtrace", &dtrace_provider_attr,
            DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &provid);

        ASSERT(dtrace_provider != NULL);
        ASSERT((dtrace_provider_id_t)dtrace_provider == provid);

        dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
            dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
        dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
            dtrace_provider, NULL, NULL, "END", 0, NULL);
        dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
            dtrace_provider, NULL, NULL, "ERROR", 1, NULL);

        dtrace_anon_property();
        mutex_exit(&cpu_lock);

        /*
         * Providers that already exist must be asked to provide their
         * probes so that any anonymous enabling can be matched against them.
         * The anonymous enabling should be the only retained enabling at
         * this point.
         */
        if (dtrace_anon.dta_enabling != NULL) {
                ASSERT(dtrace_retained == dtrace_anon.dta_enabling);

                dtrace_enabling_provide(NULL);
                anon_state = dtrace_anon.dta_state;

                /*
                 * cpu_lock could not be held across the call to
                 * dtrace_enabling_provide() above, yet it is required to
                 * actually enable the probes.  All locks are therefore
                 * dropped and retaken, with cpu_lock first, before the
                 * retained anonymous enabling is matched.
                 */
                mutex_exit(&dtrace_lock);
                mutex_exit(&dtrace_provider_lock);

                mutex_enter(&cpu_lock);
                mutex_enter(&dtrace_provider_lock);
                mutex_enter(&dtrace_lock);

                anon_enab = dtrace_anon.dta_enabling;
                if (anon_enab != NULL)
                        (void) dtrace_enabling_match(anon_enab, NULL);

                mutex_exit(&cpu_lock);
        }

        mutex_exit(&dtrace_lock);
        mutex_exit(&dtrace_provider_lock);

        /*
         * If anonymous state was created, set it going now.
         */
        if (anon_state != NULL)
                (void) dtrace_state_go(anon_state, &dtrace_anon.dta_beganon);

        return (DDI_SUCCESS);

fail:
        /*
         * Common failure exit; all three locks are still held here.
         */
        mutex_exit(&cpu_lock);
        mutex_exit(&dtrace_provider_lock);
        mutex_exit(&dtrace_lock);
        return (DDI_FAILURE);
}
#endif  /* illumos */
17185 
#ifndef illumos
/*
 * Forward declaration of the per-open teardown routine; dtrace_open() hands
 * it to devfs_set_cdevpriv() before its definition appears below.
 */
static void dtrace_dtr(void *data);
#endif
17189 
17190 /*ARGSUSED*/
17191 static int
17192 #ifdef illumos
17193 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17194 #else
17195 dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
17196 #endif
17197 {
17198         dtrace_state_t *state;
17199         uint32_t priv;
17200         uid_t uid;
17201         zoneid_t zoneid;
17202 
17203 #ifdef illumos
17204         if (getminor(*devp) == DTRACEMNRN_HELPER)
17205                 return (0);
17206 
17207         /*
17208          * If this wasn't an open with the "helper" minor, then it must be
17209          * the "dtrace" minor.
17210          */
17211         if (getminor(*devp) == DTRACEMNRN_DTRACE)
17212                 return (ENXIO);
17213 #else
17214         cred_t *cred_p = NULL;
17215         cred_p = dev->si_cred;
17216 
17217         /*
17218          * If no DTRACE_PRIV_* bits are set in the credential, then the
17219          * caller lacks sufficient permission to do anything with DTrace.
17220          */
17221         dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17222         if (priv == DTRACE_PRIV_NONE) {
17223 #endif
17224 
17225                 return (EACCES);
17226         }
17227 
17228         /*
17229          * Ask all providers to provide all their probes.
17230          */
17231         mutex_enter(&dtrace_provider_lock);
17232         dtrace_probe_provide(NULL, NULL);
17233         mutex_exit(&dtrace_provider_lock);
17234 
17235         mutex_enter(&cpu_lock);
17236         mutex_enter(&dtrace_lock);
17237         dtrace_opens++;
17238         dtrace_membar_producer();
17239 
17240 #ifdef illumos
17241         /*
17242          * If the kernel debugger is active (that is, if the kernel debugger
17243          * modified text in some way), we won't allow the open.
17244          */
17245         if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17246                 dtrace_opens--;
17247                 mutex_exit(&cpu_lock);
17248                 mutex_exit(&dtrace_lock);
17249                 return (EBUSY);
17250         }
17251 
17252         if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
17253                 /*
17254                  * If DTrace helper tracing is enabled, we need to allocate the
17255                  * trace buffer and initialize the values.
17256                  */
17257                 dtrace_helptrace_buffer =
17258                     kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
17259                 dtrace_helptrace_next = 0;
17260                 dtrace_helptrace_wrapped = 0;
17261                 dtrace_helptrace_enable = 0;
17262         }
17263 
17264         state = dtrace_state_create(devp, cred_p);
17265 #else
17266         state = dtrace_state_create(dev, NULL);
17267         devfs_set_cdevpriv(state, dtrace_dtr);
17268 #endif
17269 
17270         mutex_exit(&cpu_lock);
17271 
17272         if (state == NULL) {
17273 #ifdef illumos
17274                 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17275                         (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17276 #else
17277                 --dtrace_opens;
17278 #endif
17279                 mutex_exit(&dtrace_lock);
17280                 return (EAGAIN);
17281         }
17282 
17283         mutex_exit(&dtrace_lock);
17284 
17285         return (0);
17286 }
17287 
17288 /*ARGSUSED*/
17289 #ifdef illumos
17290 static int
17291 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17292 #else
17293 static void
17294 dtrace_dtr(void *data)
17295 #endif
17296 {
17297 #ifdef illumos
17298         minor_t minor = getminor(dev);
17299         dtrace_state_t *state;
17300 #endif
17301         dtrace_helptrace_t *buf = NULL;
17302 
17303 #ifdef illumos
17304         if (minor == DTRACEMNRN_HELPER)
17305                 return (0);
17306 
17307         state = ddi_get_soft_state(dtrace_softstate, minor);
17308 #else
17309         dtrace_state_t *state = data;
17310 #endif
17311 
17312         mutex_enter(&cpu_lock);
17313         mutex_enter(&dtrace_lock);
17314 
17315 #ifdef illumos
17316         if (state->dts_anon)
17317 #else
17318         if (state != NULL && state->dts_anon)
17319 #endif
17320         {
17321                 /*
17322                  * There is anonymous state. Destroy that first.
17323                  */
17324                 ASSERT(dtrace_anon.dta_state == NULL);
17325                 dtrace_state_destroy(state->dts_anon);
17326         }
17327 
17328         if (dtrace_helptrace_disable) {
17329                 /*
17330                  * If we have been told to disable helper tracing, set the
17331                  * buffer to NULL before calling into dtrace_state_destroy();
17332                  * we take advantage of its dtrace_sync() to know that no
17333                  * CPU is in probe context with enabled helper tracing
17334                  * after it returns.
17335                  */
17336                 buf = dtrace_helptrace_buffer;
17337                 dtrace_helptrace_buffer = NULL;
17338         }
17339 
17340 #ifdef illumos
17341         dtrace_state_destroy(state);
17342 #else
17343         if (state != NULL) {
17344                 dtrace_state_destroy(state);
17345                 kmem_free(state, 0);
17346         }
17347 #endif
17348         ASSERT(dtrace_opens > 0);
17349 
17350 #ifdef illumos
17351         /*
17352          * Only relinquish control of the kernel debugger interface when there
17353          * are no consumers and no anonymous enablings.
17354          */
17355         if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17356                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17357 #else
17358         --dtrace_opens;
17359 #endif
17360 
17361         if (buf != NULL) {
17362                 kmem_free(buf, dtrace_helptrace_bufsize);
17363                 dtrace_helptrace_disable = 0;
17364         }
17365 
17366         mutex_exit(&dtrace_lock);
17367         mutex_exit(&cpu_lock);
17368 
17369 #ifdef illumos
17370         return (0);
17371 #endif
17372 }
17373 
17374 #ifdef illumos
17375 /*ARGSUSED*/
17376 static int
17377 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
17378 {
17379         int rval;
17380         dof_helper_t help, *dhp = NULL;
17381 
17382         switch (cmd) {
17383         case DTRACEHIOC_ADDDOF:
17384                 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
17385                         dtrace_dof_error(NULL, "failed to copyin DOF helper");
17386                         return (EFAULT);
17387                 }
17388 
17389                 dhp = &help;
17390                 arg = (intptr_t)help.dofhp_dof;
17391                 /*FALLTHROUGH*/
17392 
17393         case DTRACEHIOC_ADD: {
17394                 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
17395 
17396                 if (dof == NULL)
17397                         return (rval);
17398 
17399                 mutex_enter(&dtrace_lock);
17400 
17401                 /*
17402                  * dtrace_helper_slurp() takes responsibility for the dof --
17403                  * it may free it now or it may save it and free it later.
17404                  */
17405                 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
17406                         *rv = rval;
17407                         rval = 0;
17408                 } else {
17409                         rval = EINVAL;
17410                 }
17411 
17412                 mutex_exit(&dtrace_lock);
17413                 return (rval);
17414         }
17415 
17416         case DTRACEHIOC_REMOVE: {
17417                 mutex_enter(&dtrace_lock);
17418                 rval = dtrace_helper_destroygen(NULL, arg);
17419                 mutex_exit(&dtrace_lock);
17420 
17421                 return (rval);
17422         }
17423 
17424         default:
17425                 break;
17426         }
17427 
17428         return (ENOTTY);
17429 }
17430 
17431 /*ARGSUSED*/
17432 static int
17433 dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
17434 {
17435         minor_t minor = getminor(dev);
17436         dtrace_state_t *state;
17437         int rval;
17438 
17439         if (minor == DTRACEMNRN_HELPER)
17440                 return (dtrace_ioctl_helper(cmd, arg, rv));
17441 
17442         state = ddi_get_soft_state(dtrace_softstate, minor);
17443 
17444         if (state->dts_anon) {
17445                 ASSERT(dtrace_anon.dta_state == NULL);
17446                 state = state->dts_anon;
17447         }
17448 
17449         switch (cmd) {
17450         case DTRACEIOC_PROVIDER: {
17451                 dtrace_providerdesc_t pvd;
17452                 dtrace_provider_t *pvp;
17453 
17454                 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
17455                         return (EFAULT);
17456 
17457                 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17458                 mutex_enter(&dtrace_provider_lock);
17459 
17460                 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17461                         if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
17462                                 break;
17463                 }
17464 
17465                 mutex_exit(&dtrace_provider_lock);
17466 
17467                 if (pvp == NULL)
17468                         return (ESRCH);
17469 
17470                 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17471                 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17472 
17473                 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
17474                         return (EFAULT);
17475 
17476                 return (0);
17477         }
17478 
17479         case DTRACEIOC_EPROBE: {
17480                 dtrace_eprobedesc_t epdesc;
17481                 dtrace_ecb_t *ecb;
17482                 dtrace_action_t *act;
17483                 void *buf;
17484                 size_t size;
17485                 uintptr_t dest;
17486                 int nrecs;
17487 
17488                 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
17489                         return (EFAULT);
17490 
17491                 mutex_enter(&dtrace_lock);
17492 
17493                 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17494                         mutex_exit(&dtrace_lock);
17495                         return (EINVAL);
17496                 }
17497 
17498                 if (ecb->dte_probe == NULL) {
17499                         mutex_exit(&dtrace_lock);
17500                         return (EINVAL);
17501                 }
17502 
17503                 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17504                 epdesc.dtepd_uarg = ecb->dte_uarg;
17505                 epdesc.dtepd_size = ecb->dte_size;
17506 
17507                 nrecs = epdesc.dtepd_nrecs;
17508                 epdesc.dtepd_nrecs = 0;
17509                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17510                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17511                                 continue;
17512 
17513                         epdesc.dtepd_nrecs++;
17514                 }
17515 
17516                 /*
17517                  * Now that we have the size, we need to allocate a temporary
17518                  * buffer in which to store the complete description.  We need
17519                  * the temporary buffer to be able to drop dtrace_lock()
17520                  * across the copyout(), below.
17521                  */
17522                 size = sizeof (dtrace_eprobedesc_t) +
17523                     (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17524 
17525                 buf = kmem_alloc(size, KM_SLEEP);
17526                 dest = (uintptr_t)buf;
17527 
17528                 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17529                 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17530 
17531                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17532                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17533                                 continue;
17534 
17535                         if (nrecs-- == 0)
17536                                 break;
17537 
17538                         bcopy(&act->dta_rec, (void *)dest,
17539                             sizeof (dtrace_recdesc_t));
17540                         dest += sizeof (dtrace_recdesc_t);
17541                 }
17542 
17543                 mutex_exit(&dtrace_lock);
17544 
17545                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17546                         kmem_free(buf, size);
17547                         return (EFAULT);
17548                 }
17549 
17550                 kmem_free(buf, size);
17551                 return (0);
17552         }
17553 
17554         case DTRACEIOC_AGGDESC: {
17555                 dtrace_aggdesc_t aggdesc;
17556                 dtrace_action_t *act;
17557                 dtrace_aggregation_t *agg;
17558                 int nrecs;
17559                 uint32_t offs;
17560                 dtrace_recdesc_t *lrec;
17561                 void *buf;
17562                 size_t size;
17563                 uintptr_t dest;
17564 
17565                 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
17566                         return (EFAULT);
17567 
17568                 mutex_enter(&dtrace_lock);
17569 
17570                 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17571                         mutex_exit(&dtrace_lock);
17572                         return (EINVAL);
17573                 }
17574 
17575                 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17576 
17577                 nrecs = aggdesc.dtagd_nrecs;
17578                 aggdesc.dtagd_nrecs = 0;
17579 
17580                 offs = agg->dtag_base;
17581                 lrec = &agg->dtag_action.dta_rec;
17582                 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17583 
17584                 for (act = agg->dtag_first; ; act = act->dta_next) {
17585                         ASSERT(act->dta_intuple ||
17586                             DTRACEACT_ISAGG(act->dta_kind));
17587 
17588                         /*
17589                          * If this action has a record size of zero, it
17590                          * denotes an argument to the aggregating action.
17591                          * Because the presence of this record doesn't (or
17592                          * shouldn't) affect the way the data is interpreted,
17593                          * we don't copy it out to save user-level the
17594                          * confusion of dealing with a zero-length record.
17595                          */
17596                         if (act->dta_rec.dtrd_size == 0) {
17597                                 ASSERT(agg->dtag_hasarg);
17598                                 continue;
17599                         }
17600 
17601                         aggdesc.dtagd_nrecs++;
17602 
17603                         if (act == &agg->dtag_action)
17604                                 break;
17605                 }
17606 
17607                 /*
17608                  * Now that we have the size, we need to allocate a temporary
17609                  * buffer in which to store the complete description.  We need
17610                  * the temporary buffer to be able to drop dtrace_lock()
17611                  * across the copyout(), below.
17612                  */
17613                 size = sizeof (dtrace_aggdesc_t) +
17614                     (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17615 
17616                 buf = kmem_alloc(size, KM_SLEEP);
17617                 dest = (uintptr_t)buf;
17618 
17619                 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17620                 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17621 
17622                 for (act = agg->dtag_first; ; act = act->dta_next) {
17623                         dtrace_recdesc_t rec = act->dta_rec;
17624 
17625                         /*
17626                          * See the comment in the above loop for why we pass
17627                          * over zero-length records.
17628                          */
17629                         if (rec.dtrd_size == 0) {
17630                                 ASSERT(agg->dtag_hasarg);
17631                                 continue;
17632                         }
17633 
17634                         if (nrecs-- == 0)
17635                                 break;
17636 
17637                         rec.dtrd_offset -= offs;
17638                         bcopy(&rec, (void *)dest, sizeof (rec));
17639                         dest += sizeof (dtrace_recdesc_t);
17640 
17641                         if (act == &agg->dtag_action)
17642                                 break;
17643                 }
17644 
17645                 mutex_exit(&dtrace_lock);
17646 
17647                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17648                         kmem_free(buf, size);
17649                         return (EFAULT);
17650                 }
17651 
17652                 kmem_free(buf, size);
17653                 return (0);
17654         }
17655 
17656         case DTRACEIOC_ENABLE: {
17657                 dof_hdr_t *dof;
17658                 dtrace_enabling_t *enab = NULL;
17659                 dtrace_vstate_t *vstate;
17660                 int err = 0;
17661 
17662                 *rv = 0;
17663 
17664                 /*
17665                  * If a NULL argument has been passed, we take this as our
17666                  * cue to reevaluate our enablings.
17667                  */
17668                 if (arg == NULL) {
17669                         dtrace_enabling_matchall();
17670 
17671                         return (0);
17672                 }
17673 
17674                 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17675                         return (rval);
17676 
17677                 mutex_enter(&cpu_lock);
17678                 mutex_enter(&dtrace_lock);
17679                 vstate = &state->dts_vstate;
17680 
17681                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17682                         mutex_exit(&dtrace_lock);
17683                         mutex_exit(&cpu_lock);
17684                         dtrace_dof_destroy(dof);
17685                         return (EBUSY);
17686                 }
17687 
17688                 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17689                         mutex_exit(&dtrace_lock);
17690                         mutex_exit(&cpu_lock);
17691                         dtrace_dof_destroy(dof);
17692                         return (EINVAL);
17693                 }
17694 
17695                 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17696                         dtrace_enabling_destroy(enab);
17697                         mutex_exit(&dtrace_lock);
17698                         mutex_exit(&cpu_lock);
17699                         dtrace_dof_destroy(dof);
17700                         return (rval);
17701                 }
17702 
17703                 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
17704                         err = dtrace_enabling_retain(enab);
17705                 } else {
17706                         dtrace_enabling_destroy(enab);
17707                 }
17708 
17709                 mutex_exit(&cpu_lock);
17710                 mutex_exit(&dtrace_lock);
17711                 dtrace_dof_destroy(dof);
17712 
17713                 return (err);
17714         }
17715 
17716         case DTRACEIOC_REPLICATE: {
17717                 dtrace_repldesc_t desc;
17718                 dtrace_probedesc_t *match = &desc.dtrpd_match;
17719                 dtrace_probedesc_t *create = &desc.dtrpd_create;
17720                 int err;
17721 
17722                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17723                         return (EFAULT);
17724 
17725                 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17726                 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17727                 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17728                 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17729 
17730                 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17731                 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17732                 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17733                 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17734 
17735                 mutex_enter(&dtrace_lock);
17736                 err = dtrace_enabling_replicate(state, match, create);
17737                 mutex_exit(&dtrace_lock);
17738 
17739                 return (err);
17740         }
17741 
17742         case DTRACEIOC_PROBEMATCH:
17743         case DTRACEIOC_PROBES: {
17744                 dtrace_probe_t *probe = NULL;
17745                 dtrace_probedesc_t desc;
17746                 dtrace_probekey_t pkey;
17747                 dtrace_id_t i;
17748                 int m = 0;
17749                 uint32_t priv;
17750                 uid_t uid;
17751                 zoneid_t zoneid;
17752 
17753                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17754                         return (EFAULT);
17755 
17756                 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17757                 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17758                 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17759                 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17760 
17761                 /*
17762                  * Before we attempt to match this probe, we want to give
17763                  * all providers the opportunity to provide it.
17764                  */
17765                 if (desc.dtpd_id == DTRACE_IDNONE) {
17766                         mutex_enter(&dtrace_provider_lock);
17767                         dtrace_probe_provide(&desc, NULL);
17768                         mutex_exit(&dtrace_provider_lock);
17769                         desc.dtpd_id++;
17770                 }
17771 
17772                 if (cmd == DTRACEIOC_PROBEMATCH)  {
17773                         dtrace_probekey(&desc, &pkey);
17774                         pkey.dtpk_id = DTRACE_IDNONE;
17775                 }
17776 
17777                 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17778 
17779                 mutex_enter(&dtrace_lock);
17780 
17781                 if (cmd == DTRACEIOC_PROBEMATCH) {
17782                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17783                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17784                                     (m = dtrace_match_probe(probe, &pkey,
17785                                     priv, uid, zoneid)) != 0)
17786                                         break;
17787                         }
17788 
17789                         if (m < 0) {
17790                                 mutex_exit(&dtrace_lock);
17791                                 return (EINVAL);
17792                         }
17793 
17794                 } else {
17795                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17796                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17797                                     dtrace_match_priv(probe, priv, uid, zoneid))
17798                                         break;
17799                         }
17800                 }
17801 
17802                 if (probe == NULL) {
17803                         mutex_exit(&dtrace_lock);
17804                         return (ESRCH);
17805                 }
17806 
17807                 dtrace_probe_description(probe, &desc);
17808                 mutex_exit(&dtrace_lock);
17809 
17810                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17811                         return (EFAULT);
17812 
17813                 return (0);
17814         }
17815 
17816         case DTRACEIOC_PROBEARG: {
17817                 dtrace_argdesc_t desc;
17818                 dtrace_probe_t *probe;
17819                 dtrace_provider_t *prov;
17820 
17821                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17822                         return (EFAULT);
17823 
17824                 if (desc.dtargd_id == DTRACE_IDNONE)
17825                         return (EINVAL);
17826 
17827                 if (desc.dtargd_ndx == DTRACE_ARGNONE)
17828                         return (EINVAL);
17829 
17830                 mutex_enter(&dtrace_provider_lock);
17831                 mutex_enter(&mod_lock);
17832                 mutex_enter(&dtrace_lock);
17833 
17834                 if (desc.dtargd_id > dtrace_nprobes) {
17835                         mutex_exit(&dtrace_lock);
17836                         mutex_exit(&mod_lock);
17837                         mutex_exit(&dtrace_provider_lock);
17838                         return (EINVAL);
17839                 }
17840 
17841                 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17842                         mutex_exit(&dtrace_lock);
17843                         mutex_exit(&mod_lock);
17844                         mutex_exit(&dtrace_provider_lock);
17845                         return (EINVAL);
17846                 }
17847 
17848                 mutex_exit(&dtrace_lock);
17849 
17850                 prov = probe->dtpr_provider;
17851 
17852                 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17853                         /*
17854                          * There isn't any typed information for this probe.
17855                          * Set the argument number to DTRACE_ARGNONE.
17856                          */
17857                         desc.dtargd_ndx = DTRACE_ARGNONE;
17858                 } else {
17859                         desc.dtargd_native[0] = '\0';
17860                         desc.dtargd_xlate[0] = '\0';
17861                         desc.dtargd_mapping = desc.dtargd_ndx;
17862 
17863                         prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17864                             probe->dtpr_id, probe->dtpr_arg, &desc);
17865                 }
17866 
17867                 mutex_exit(&mod_lock);
17868                 mutex_exit(&dtrace_provider_lock);
17869 
17870                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17871                         return (EFAULT);
17872 
17873                 return (0);
17874         }
17875 
17876         case DTRACEIOC_GO: {
17877                 processorid_t cpuid;
17878                 rval = dtrace_state_go(state, &cpuid);
17879 
17880                 if (rval != 0)
17881                         return (rval);
17882 
17883                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17884                         return (EFAULT);
17885 
17886                 return (0);
17887         }
17888 
17889         case DTRACEIOC_STOP: {
17890                 processorid_t cpuid;
17891 
17892                 mutex_enter(&dtrace_lock);
17893                 rval = dtrace_state_stop(state, &cpuid);
17894                 mutex_exit(&dtrace_lock);
17895 
17896                 if (rval != 0)
17897                         return (rval);
17898 
17899                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17900                         return (EFAULT);
17901 
17902                 return (0);
17903         }
17904 
17905         case DTRACEIOC_DOFGET: {
17906                 dof_hdr_t hdr, *dof;
17907                 uint64_t len;
17908 
17909                 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
17910                         return (EFAULT);
17911 
17912                 mutex_enter(&dtrace_lock);
17913                 dof = dtrace_dof_create(state);
17914                 mutex_exit(&dtrace_lock);
17915 
17916                 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17917                 rval = copyout(dof, (void *)arg, len);
17918                 dtrace_dof_destroy(dof);
17919 
17920                 return (rval == 0 ? 0 : EFAULT);
17921         }
17922 
17923         case DTRACEIOC_AGGSNAP:
17924         case DTRACEIOC_BUFSNAP: {
17925                 dtrace_bufdesc_t desc;
17926                 caddr_t cached;
17927                 dtrace_buffer_t *buf;
17928 
17929                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17930                         return (EFAULT);
17931 
17932                 if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17933                         return (EINVAL);
17934 
17935                 mutex_enter(&dtrace_lock);
17936 
17937                 if (cmd == DTRACEIOC_BUFSNAP) {
17938                         buf = &state->dts_buffer[desc.dtbd_cpu];
17939                 } else {
17940                         buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17941                 }
17942 
17943                 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17944                         size_t sz = buf->dtb_offset;
17945 
17946                         if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17947                                 mutex_exit(&dtrace_lock);
17948                                 return (EBUSY);
17949                         }
17950 
17951                         /*
17952                          * If this buffer has already been consumed, we're
17953                          * going to indicate that there's nothing left here
17954                          * to consume.
17955                          */
17956                         if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17957                                 mutex_exit(&dtrace_lock);
17958 
17959                                 desc.dtbd_size = 0;
17960                                 desc.dtbd_drops = 0;
17961                                 desc.dtbd_errors = 0;
17962                                 desc.dtbd_oldest = 0;
17963                                 sz = sizeof (desc);
17964 
17965                                 if (copyout(&desc, (void *)arg, sz) != 0)
17966                                         return (EFAULT);
17967 
17968                                 return (0);
17969                         }
17970 
17971                         /*
17972                          * If this is a ring buffer that has wrapped, we want
17973                          * to copy the whole thing out.
17974                          */
17975                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17976                                 dtrace_buffer_polish(buf);
17977                                 sz = buf->dtb_size;
17978                         }
17979 
17980                         if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
17981                                 mutex_exit(&dtrace_lock);
17982                                 return (EFAULT);
17983                         }
17984 
17985                         desc.dtbd_size = sz;
17986                         desc.dtbd_drops = buf->dtb_drops;
17987                         desc.dtbd_errors = buf->dtb_errors;
17988                         desc.dtbd_oldest = buf->dtb_xamot_offset;
17989                         desc.dtbd_timestamp = dtrace_gethrtime();
17990 
17991                         mutex_exit(&dtrace_lock);
17992 
17993                         if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17994                                 return (EFAULT);
17995 
17996                         buf->dtb_flags |= DTRACEBUF_CONSUMED;
17997 
17998                         return (0);
17999                 }
18000 
18001                 if (buf->dtb_tomax == NULL) {
18002                         ASSERT(buf->dtb_xamot == NULL);
18003                         mutex_exit(&dtrace_lock);
18004                         return (ENOENT);
18005                 }
18006 
18007                 cached = buf->dtb_tomax;
18008                 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18009 
18010                 dtrace_xcall(desc.dtbd_cpu,
18011                     (dtrace_xcall_t)dtrace_buffer_switch, buf);
18012 
18013                 state->dts_errors += buf->dtb_xamot_errors;
18014 
18015                 /*
18016                  * If the buffers did not actually switch, then the cross call
18017                  * did not take place -- presumably because the given CPU is
18018                  * not in the ready set.  If this is the case, we'll return
18019                  * ENOENT.
18020                  */
18021                 if (buf->dtb_tomax == cached) {
18022                         ASSERT(buf->dtb_xamot != cached);
18023                         mutex_exit(&dtrace_lock);
18024                         return (ENOENT);
18025                 }
18026 
18027                 ASSERT(cached == buf->dtb_xamot);
18028 
18029                 /*
18030                  * We have our snapshot; now copy it out.
18031                  */
18032                 if (copyout(buf->dtb_xamot, desc.dtbd_data,
18033                     buf->dtb_xamot_offset) != 0) {
18034                         mutex_exit(&dtrace_lock);
18035                         return (EFAULT);
18036                 }
18037 
18038                 desc.dtbd_size = buf->dtb_xamot_offset;
18039                 desc.dtbd_drops = buf->dtb_xamot_drops;
18040                 desc.dtbd_errors = buf->dtb_xamot_errors;
18041                 desc.dtbd_oldest = 0;
18042                 desc.dtbd_timestamp = buf->dtb_switched;
18043 
18044                 mutex_exit(&dtrace_lock);
18045 
18046                 /*
18047                  * Finally, copy out the buffer description.
18048                  */
18049                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
18050                         return (EFAULT);
18051 
18052                 return (0);
18053         }
18054 
18055         case DTRACEIOC_CONF: {
18056                 dtrace_conf_t conf;
18057 
18058                 bzero(&conf, sizeof (conf));
18059                 conf.dtc_difversion = DIF_VERSION;
18060                 conf.dtc_difintregs = DIF_DIR_NREGS;
18061                 conf.dtc_diftupregs = DIF_DTR_NREGS;
18062                 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18063 
18064                 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
18065                         return (EFAULT);
18066 
18067                 return (0);
18068         }
18069 
18070         case DTRACEIOC_STATUS: {
18071                 dtrace_status_t stat;
18072                 dtrace_dstate_t *dstate;
18073                 int i, j;
18074                 uint64_t nerrs;
18075 
18076                 /*
18077                  * See the comment in dtrace_state_deadman() for the reason
18078                  * for setting dts_laststatus to INT64_MAX before setting
18079                  * it to the correct value.
18080                  */
18081                 state->dts_laststatus = INT64_MAX;
18082                 dtrace_membar_producer();
18083                 state->dts_laststatus = dtrace_gethrtime();
18084 
18085                 bzero(&stat, sizeof (stat));
18086 
18087                 mutex_enter(&dtrace_lock);
18088 
18089                 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18090                         mutex_exit(&dtrace_lock);
18091                         return (ENOENT);
18092                 }
18093 
18094                 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18095                         stat.dtst_exiting = 1;
18096 
18097                 nerrs = state->dts_errors;
18098                 dstate = &state->dts_vstate.dtvs_dynvars;
18099 
18100                 for (i = 0; i < NCPU; i++) {
18101                         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
18102 
18103                         stat.dtst_dyndrops += dcpu->dtdsc_drops;
18104                         stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18105                         stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18106 
18107                         if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18108                                 stat.dtst_filled++;
18109 
18110                         nerrs += state->dts_buffer[i].dtb_errors;
18111 
18112                         for (j = 0; j < state->dts_nspeculations; j++) {
18113                                 dtrace_speculation_t *spec;
18114                                 dtrace_buffer_t *buf;
18115 
18116                                 spec = &state->dts_speculations[j];
18117                                 buf = &spec->dtsp_buffer[i];
18118                                 stat.dtst_specdrops += buf->dtb_xamot_drops;
18119                         }
18120                 }
18121 
18122                 stat.dtst_specdrops_busy = state->dts_speculations_busy;
18123                 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18124                 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18125                 stat.dtst_dblerrors = state->dts_dblerrors;
18126                 stat.dtst_killed =
18127                     (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18128                 stat.dtst_errors = nerrs;
18129 
18130                 mutex_exit(&dtrace_lock);
18131 
18132                 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
18133                         return (EFAULT);
18134 
18135                 return (0);
18136         }
18137 
18138         case DTRACEIOC_FORMAT: {
18139                 dtrace_fmtdesc_t fmt;
18140                 char *str;
18141                 int len;
18142 
18143                 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
18144                         return (EFAULT);
18145 
18146                 mutex_enter(&dtrace_lock);
18147 
18148                 if (fmt.dtfd_format == 0 ||
18149                     fmt.dtfd_format > state->dts_nformats) {
18150                         mutex_exit(&dtrace_lock);
18151                         return (EINVAL);
18152                 }
18153 
18154                 /*
18155                  * Format strings are allocated contiguously and they are
18156                  * never freed; if a format index is less than the number
18157                  * of formats, we can assert that the format map is non-NULL
18158                  * and that the format for the specified index is non-NULL.
18159                  */
18160                 ASSERT(state->dts_formats != NULL);
18161                 str = state->dts_formats[fmt.dtfd_format - 1];
18162                 ASSERT(str != NULL);
18163 
18164                 len = strlen(str) + 1;
18165 
18166                 if (len > fmt.dtfd_length) {
18167                         fmt.dtfd_length = len;
18168 
18169                         if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
18170                                 mutex_exit(&dtrace_lock);
18171                                 return (EINVAL);
18172                         }
18173                 } else {
18174                         if (copyout(str, fmt.dtfd_string, len) != 0) {
18175                                 mutex_exit(&dtrace_lock);
18176                                 return (EINVAL);
18177                         }
18178                 }
18179 
18180                 mutex_exit(&dtrace_lock);
18181                 return (0);
18182         }
18183 
18184         default:
18185                 break;
18186         }
18187 
18188         return (ENOTTY);
18189 }
18190 
/*
 * Detach entry point for the DTrace pseudo-driver (the detach slot of
 * dtrace_ops).
 *
 * DDI_SUSPEND is acknowledged with DDI_SUCCESS without doing any work, and
 * any command other than DDI_DETACH or DDI_SUSPEND fails with DDI_FAILURE.
 *
 * For DDI_DETACH the routine takes cpu_lock, dtrace_provider_lock and
 * dtrace_lock, and refuses to detach (DDI_FAILURE, all locks released) if
 * dtrace_helpers is greater than zero or if dtrace_unregister() of the
 * dtrace provider fails.  Otherwise it destroys any anonymous state, clears
 * the global pointers listed below, frees the probe array, the three lookup
 * hashes, the state cache, the two vmem arenas and the toxic-range table,
 * removes the minor node and soft state, drops the remaining locks and
 * finally destroys the task queue.
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.  The dip argument is not used.
 */
18191 /*ARGSUSED*/
18192 static int
18193 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18194 {
18195         dtrace_state_t *state;
18196 
              /* Only DDI_DETACH continues on to the teardown below. */
18197         switch (cmd) {
18198         case DDI_DETACH:
18199                 break;
18200 
18201         case DDI_SUSPEND:
18202                 return (DDI_SUCCESS);
18203 
18204         default:
18205                 return (DDI_FAILURE);
18206         }
18207 
              /*
               * Acquisition order here is cpu_lock, then
               * dtrace_provider_lock, then dtrace_lock.
               */
18208         mutex_enter(&cpu_lock);
18209         mutex_enter(&dtrace_provider_lock);
18210         mutex_enter(&dtrace_lock);
18211 
18212         ASSERT(dtrace_opens == 0);
18213 
              /*
               * Refuse to detach while dtrace_helpers is non-zero; nothing
               * has been torn down yet, so just release the locks and fail.
               */
18214         if (dtrace_helpers > 0) {
18215                 mutex_exit(&dtrace_provider_lock);
18216                 mutex_exit(&dtrace_lock);
18217                 mutex_exit(&cpu_lock);
18218                 return (DDI_FAILURE);
18219         }
18220 
              /*
               * Unregister the dtrace provider itself.  A failure here also
               * abandons the detach before any state has been released.
               */
18221         if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18222                 mutex_exit(&dtrace_provider_lock);
18223                 mutex_exit(&dtrace_lock);
18224                 mutex_exit(&cpu_lock);
18225                 return (DDI_FAILURE);
18226         }
18227 
18228         dtrace_provider = NULL;
18229 
18230         if ((state = dtrace_anon_grab()) != NULL) {
18231                 /*
18232                  * If there were ECBs on this state, the provider should
18233                  * not have been allowed to detach; assert that there are
18234                  * none.
18235                  */
18236                 ASSERT(state->dts_necbs == 0);
18237                 dtrace_state_destroy(state);
18238 
18239                 /*
18240                  * If we're being detached with anonymous state, we need to
18241                  * indicate to the kernel debugger that DTrace is now inactive.
18242                  */
18243                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18244         }
18245 
              /*
               * Zero the anonymous-state record, unregister the CPU setup
               * callback and reset the global pointers below to NULL.
               * NOTE(review): these look like kernel hooks installed at
               * attach time -- confirm against dtrace_attach().
               */
18246         bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18247         unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18248         dtrace_cpu_init = NULL;
18249         dtrace_helpers_cleanup = NULL;
18250         dtrace_helpers_fork = NULL;
18251         dtrace_cpustart_init = NULL;
18252         dtrace_cpustart_fini = NULL;
18253         dtrace_debugger_init = NULL;
18254         dtrace_debugger_fini = NULL;
18255         dtrace_modload = NULL;
18256         dtrace_modunload = NULL;
18257 
18258         ASSERT(dtrace_getf == 0);
18259         ASSERT(dtrace_closef == NULL);
18260 
              /*
               * cpu_lock is released here; dtrace_provider_lock and
               * dtrace_lock stay held for the rest of the teardown.
               */
18261         mutex_exit(&cpu_lock);
18262 
              /* Free the probe pointer array and reset its bookkeeping. */
18263         kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
18264         dtrace_probes = NULL;
18265         dtrace_nprobes = 0;
18266 
              /* Destroy the by-module, by-function and by-name hashes. */
18267         dtrace_hash_destroy(dtrace_bymod);
18268         dtrace_hash_destroy(dtrace_byfunc);
18269         dtrace_hash_destroy(dtrace_byname);
18270         dtrace_bymod = NULL;
18271         dtrace_byfunc = NULL;
18272         dtrace_byname = NULL;
18273 
18274         kmem_cache_destroy(dtrace_state_cache);
18275         vmem_destroy(dtrace_minor);
18276         vmem_destroy(dtrace_arena);
18277 
              /* The toxic-range table is freed only if it was allocated. */
18278         if (dtrace_toxrange != NULL) {
18279                 kmem_free(dtrace_toxrange,
18280                     dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
18281                 dtrace_toxrange = NULL;
18282                 dtrace_toxranges = 0;
18283                 dtrace_toxranges_max = 0;
18284         }
18285 
18286         ddi_remove_minor_node(dtrace_devi, NULL);
18287         dtrace_devi = NULL;
18288 
18289         ddi_soft_state_fini(&dtrace_softstate);
18290 
18291         ASSERT(dtrace_vtime_references == 0);
18292         ASSERT(dtrace_opens == 0);
18293         ASSERT(dtrace_retained == NULL);
18294 
18295         mutex_exit(&dtrace_lock);
18296         mutex_exit(&dtrace_provider_lock);
18297 
18298         /*
18299          * We don't destroy the task queue until after we have dropped our
18300          * locks (taskq_destroy() may block on running tasks).  To prevent
18301          * attempting to do work after we have effectively detached but before
18302          * the task queue has been destroyed, all tasks dispatched via the
18303          * task queue must check that DTrace is still attached before
18304          * performing any operation.
18305          */
18306         taskq_destroy(dtrace_taskq);
18307         dtrace_taskq = NULL;
18308 
18309         return (DDI_SUCCESS);
18310 }
18311 #endif
18312 
18313 #ifdef illumos
/*
 * Info entry point for the DTrace pseudo-driver (the get_dev_info slot of
 * dtrace_ops).
 *
 *   DDI_INFO_DEVT2DEVINFO   stores dtrace_devi in *result, DDI_SUCCESS
 *   DDI_INFO_DEVT2INSTANCE  stores 0 in *result, DDI_SUCCESS
 *   anything else           leaves *result untouched, DDI_FAILURE
 *
 * The dip and arg arguments are not used.
 */
18314 /*ARGSUSED*/
18315 static int
18316 dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
18317 {
18318         int error;
18319 
18320         switch (infocmd) {
18321         case DDI_INFO_DEVT2DEVINFO:
18322                 *result = (void *)dtrace_devi;
18323                 error = DDI_SUCCESS;
18324                 break;
18325         case DDI_INFO_DEVT2INSTANCE:
                      /* The instance reported is always 0. */
18326                 *result = (void *)0;
18327                 error = DDI_SUCCESS;
18328                 break;
18329         default:
18330                 error = DDI_FAILURE;
18331         }
18332         return (error);
18333 }
18334 #endif
18335 
18336 #ifdef illumos
/*
 * illumos character/block entry-point table for the DTrace device.  Only
 * open, close and ioctl are backed by DTrace routines (dtrace_open,
 * dtrace_close, dtrace_ioctl); every other slot is filled with a
 * nulldev/nodev/nochpoll placeholder or the generic ddi_prop_op.
 */
18337 static struct cb_ops dtrace_cb_ops = {
18338         dtrace_open,            /* open */
18339         dtrace_close,           /* close */
18340         nulldev,                /* strategy */
18341         nulldev,                /* print */
18342         nodev,                  /* dump */
18343         nodev,                  /* read */
18344         nodev,                  /* write */
18345         dtrace_ioctl,           /* ioctl */
18346         nodev,                  /* devmap */
18347         nodev,                  /* mmap */
18348         nodev,                  /* segmap */
18349         nochpoll,               /* poll */
18350         ddi_prop_op,            /* cb_prop_op */
18351         0,                      /* streamtab  */
18352         D_NEW | D_MP            /* Driver compatibility flag */
18353 };
18354 
/*
 * illumos device-operations table for the DTrace driver.  It wires up
 * dtrace_info, dtrace_attach and dtrace_detach, and points at dtrace_cb_ops
 * for the driver operations; there are no bus operations.
 */
18355 static struct dev_ops dtrace_ops = {
18356         DEVO_REV,               /* devo_rev */
18357         0,                      /* refcnt */
18358         dtrace_info,            /* get_dev_info */
18359         nulldev,                /* identify */
18360         nulldev,                /* probe */
18361         dtrace_attach,          /* attach */
18362         dtrace_detach,          /* detach */
18363         nodev,                  /* reset */
18364         &dtrace_cb_ops,         /* driver operations */
18365         NULL,                   /* bus operations */
18366         nodev                   /* dev power */
18367 };
18368 
/*
 * Driver-module description: names the module "Dynamic Tracing" and ties
 * it to the dtrace_ops table.
 */
18369 static struct modldrv modldrv = {
18370         &mod_driverops,         /* module type (this is a pseudo driver) */
18371         "Dynamic Tracing",      /* name of module */
18372         &dtrace_ops,            /* driver ops */
18373 };
18374 
/*
 * Module linkage record handed to mod_install(), mod_info() and
 * mod_remove() below; its only entry is modldrv, followed by a NULL
 * terminator.
 */
18375 static struct modlinkage modlinkage = {
18376         MODREV_1,
18377         (void *)&modldrv,
18378         NULL
18379 };
18380 
/*
 * Passes modlinkage to mod_install() and returns its result unchanged.
 */
18381 int
18382 _init(void)
18383 {
18384         return (mod_install(&modlinkage));
18385 }
18386 
/*
 * Passes modlinkage and the caller's modinfop to mod_info() and returns
 * its result unchanged.
 */
18387 int
18388 _info(struct modinfo *modinfop)
18389 {
18390         return (mod_info(&modlinkage, modinfop));
18391 }
18392 
/*
 * Passes modlinkage to mod_remove() and returns its result unchanged.
 */
18393 int
18394 _fini(void)
18395 {
18396         return (mod_remove(&modlinkage));
18397 }
18398 #else
18399 
/*
 * Forward declarations for the non-illumos build: the two ioctl handlers
 * referenced by the cdevsw tables below, the load/unload routines
 * registered through SYSINIT/SYSUNINIT further down, and two struct cdev
 * pointers.
 * NOTE(review): dtrace_dev and helper_dev are not assigned in this part of
 * the file; presumably the included load/unload code manages them.
 */
18400 static d_ioctl_t        dtrace_ioctl;
18401 static d_ioctl_t        dtrace_ioctl_helper;
18402 static void             dtrace_load(void *);
18403 static int              dtrace_unload(void);
18404 static struct cdev      *dtrace_dev;
18405 static struct cdev      *helper_dev;
18406 
/* Declared here with external linkage; the definitions are not in this view. */
18407 void dtrace_invop_init(void);
18408 void dtrace_invop_uninit(void);
18409 
/*
 * Character-device switch for the device named "dtrace": ioctl requests go
 * to dtrace_ioctl and opens go to dtrace_open.  No other entry points are
 * set.
 */
18410 static struct cdevsw dtrace_cdevsw = {
18411         .d_version      = D_VERSION,
18412         .d_ioctl        = dtrace_ioctl,
18413         .d_open         = dtrace_open,
18414         .d_name         = "dtrace",
18415 };
18416 
/*
 * Character-device switch for the device named "helper": only an ioctl
 * handler (dtrace_ioctl_helper) is provided; no open routine is set.
 */
18417 static struct cdevsw helper_cdevsw = {
18418         .d_version      = D_VERSION,
18419         .d_ioctl        = dtrace_ioctl_helper,
18420         .d_name         = "helper",
18421 };
18422 
18423 #include <dtrace_anon.c>
18424 #include <dtrace_ioctl.c>
18425 #include <dtrace_load.c>
18426 #include <dtrace_modevent.c>
18427 #include <dtrace_sysctl.c>
18428 #include <dtrace_unload.c>
18429 #include <dtrace_vtime.c>
18430 #include <dtrace_hacks.c>
18431 #include <dtrace_isa.c>
18432 
/*
 * Registration for the non-illumos build: dtrace_load and dtrace_unload are
 * both registered at SI_SUB_DTRACE / SI_ORDER_FIRST, and dtrace_anon_init
 * at SI_SUB_DTRACE_ANON / SI_ORDER_FIRST, each with a NULL argument.
 */
18433 SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
18434 SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
18435 SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);
18436 
/*
 * Declare the "dtrace" module with dtrace_modevent as its event handler,
 * give it version 1, and record a dependency on the "opensolaris" module.
 * NOTE(review): the three 1s are presumably the minimum, preferred and
 * maximum acceptable opensolaris versions -- confirm against
 * MODULE_DEPEND(9).
 */
18437 DEV_MODULE(dtrace, dtrace_modevent, NULL);
18438 MODULE_VERSION(dtrace, 1);
18439 MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
18440 #endif

Cache object: 54a8e0a001e90856ecfda828fa344e06


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.