/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
/*
* DTrace - Dynamic Tracing for Solaris
*
* This is the implementation of the Solaris Dynamic Tracing framework
* (DTrace). The user-visible interface to DTrace is described at length in
* the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
* library, the in-kernel DTrace framework, and the DTrace providers are
* described in the block comments in the <sys/dtrace.h> header file. The
* internal architecture of DTrace is described in the block comments in the
* <sys/dtrace_impl.h> header file. The comments contained within the DTrace
* implementation very much assume mastery of all of these sources; if one has
* an unanswered question about the implementation, one should consult them
* first.
*
* The functions here are ordered roughly as follows:
*
* - Probe context functions
* - Probe hashing functions
* - Non-probe context utility functions
* - Matching functions
* - Provider-to-Framework API functions
* - Probe management functions
* - DIF object functions
* - Format functions
* - Predicate functions
* - ECB functions
* - Buffer functions
* - Enabling functions
* - DOF functions
* - Anonymous enabling functions
* - Consumer state functions
* - Helper functions
* - Hook functions
* - Driver cookbook functions
*
* Each group of functions begins with a block comment labelled the "DTrace
* [Group] Functions", allowing one to find each block by searching forward
* on capital-f functions.
*/
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#include <sys/ctf_api.h>
#include <sys/panic.h>
#include <sys/priv_impl.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#include <sys/taskq.h>
#include <sys/mkdev.h>
#include <sys/kdi.h>
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "strtolctype.h"
/*
* DTrace Tunable Variables
*
* The following variables may be tuned by adding a line to /etc/system that
* includes both the name of the DTrace module ("dtrace") and the name of the
* variable. For example:
*
* set dtrace:dtrace_destructive_disallow = 1
*
* In general, the only variables that one should be tuning this way are those
* that affect system-wide DTrace behavior, and for which the default behavior
* is undesirable. Most of these variables are tunable on a per-consumer
* basis using DTrace options, and need not be tuned on a system-wide basis.
* When tuning these variables, avoid pathological values; while some attempt
* is made to verify the integrity of these variables, they are not considered
* part of the supported interface to DTrace, and they are therefore not
* checked comprehensively. Further, these variables should not be tuned
* dynamically via "mdb -kw" or other means; they should only be tuned via
* /etc/system.
*/
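/*
 * As an illustration of the per-consumer alternative mentioned above, a
 * single consumer can typically request a larger string size with either
 * of the following (rather than tuning dtrace_strsize_default for the
 * entire system):
 *
 * dtrace -x strsize=512
 * #pragma D option strsize=512
 */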
int dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024);
size_t dtrace_statvar_maxsize = (16 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 1024;
dtrace_optval_t dtrace_helper_providers_max = 32;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */
dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = MSEC2NSEC(500); /* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
hrtime_t dtrace_deadman_interval = NANOSEC;
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
/*
* DTrace External Variables
*
* As dtrace(7D) is a kernel module, any DTrace variables are obviously
* available to DTrace consumers via the backtick (`) syntax. One of these,
* dtrace_zero, is made deliberately so: it is provided as a source of
* well-known, zero-filled memory. While this variable is not documented,
* it is used by some translators as an implementation detail.
*/
const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
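/*
 * For example, a D script can refer to the variable above directly as
 * `dtrace_zero (or, module-qualified, as dtrace`dtrace_zero) via the
 * backtick syntax.
 */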
/*
* DTrace Internal Variables
*/
static dev_info_t *dtrace_devi; /* device info */
static vmem_t *dtrace_arena; /* probe ID arena */
static vmem_t *dtrace_minor; /* minor number arena */
static taskq_t *dtrace_taskq; /* task queue */
static dtrace_probe_t **dtrace_probes; /* array of all probes */
static int dtrace_nprobes; /* number of probes */
static dtrace_provider_t *dtrace_provider; /* provider list */
static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
static int dtrace_opens; /* number of opens */
static int dtrace_helpers; /* number of helpers */
static int dtrace_getf; /* number of unpriv getf()s */
static void *dtrace_softstate; /* softstate pointer */
static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
static int dtrace_toxranges; /* number of toxic ranges */
static int dtrace_toxranges_max; /* size of toxic range array */
static dtrace_anon_t dtrace_anon; /* anonymous enabling */
static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t *dtrace_panicked; /* panicking thread */
static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t dtrace_probegen; /* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
static int dtrace_dynvar_failclean; /* dynvars failed to clean */
/*
* DTrace Locking
* DTrace is protected by three (relatively coarse-grained) locks:
*
* (1) dtrace_lock is required to manipulate essentially any DTrace state,
* including enabling state, probes, ECBs, consumer state, helper state,
* etc. Importantly, dtrace_lock is _not_ required when in probe context;
* probe context is lock-free -- synchronization is handled via the
* dtrace_sync() cross call mechanism.
*
* (2) dtrace_provider_lock is required when manipulating provider state, or
* when provider state must be held constant.
*
* (3) dtrace_meta_lock is required when manipulating meta provider state, or
* when meta provider state must be held constant.
*
* The lock ordering between these three locks is dtrace_meta_lock before
* dtrace_provider_lock before dtrace_lock. (In particular, there are
* several places where dtrace_provider_lock is held by the framework as it
* calls into the providers -- which then call back into the framework,
* grabbing dtrace_lock.)
*
* There are two other locks in the mix: mod_lock and cpu_lock. With respect
* to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
* role as a coarse-grained lock; it is acquired before both of these locks.
* With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
* be acquired _between_ dtrace_meta_lock and any other DTrace locks.
* mod_lock is similar with respect to dtrace_provider_lock in that it must be
* acquired _between_ dtrace_provider_lock and dtrace_lock.
*/
static kmutex_t dtrace_lock; /* probe state lock */
static kmutex_t dtrace_provider_lock; /* provider state lock */
static kmutex_t dtrace_meta_lock; /* meta-provider state lock */
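/*
 * The following is an illustrative (non-live) sketch of the full ordering
 * described above, assuming a hypothetical framework path that needed to
 * hold every one of these locks at once:
 */
#if 0
mutex_enter(&dtrace_meta_lock); /* outermost DTrace lock */
mutex_enter(&cpu_lock); /* between the meta lock and the others */
mutex_enter(&dtrace_provider_lock);
mutex_enter(&mod_lock); /* between the provider lock and dtrace_lock */
mutex_enter(&dtrace_lock); /* innermost DTrace lock */
/* ... manipulate framework, provider and meta-provider state ... */
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
mutex_exit(&cpu_lock);
mutex_exit(&dtrace_meta_lock);
#endif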
/*
* DTrace Provider Variables
*
* These are the variables relating to DTrace as a provider (that is, the
* provider of the BEGIN, END, and ERROR probes).
*/
static dtrace_pattr_t dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};
static void
dtrace_nullop(void)
{}
static int
dtrace_enable_nullop(void)
{
return (0);
}
static dtrace_pops_t dtrace_provider_ops = {
(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
(void (*)(void *, struct modctl *))dtrace_nullop,
(int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
NULL,
NULL,
NULL,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};
static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
static dtrace_id_t dtrace_probeid_end; /* special END probe */
dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
/*
* DTrace Helper Tracing Variables
*
* These variables should be set dynamically to enable helper tracing. The
* only variables that should be set are dtrace_helptrace_enable (which should
* be set to a non-zero value to allocate helper tracing buffers on the next
* open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
* non-zero value to deallocate helper tracing buffers on the next close of
* /dev/dtrace). When (and only when) helper tracing is disabled, the
* buffer size may also be set via dtrace_helptrace_bufsize.
*/
int dtrace_helptrace_enable = 0;
int dtrace_helptrace_disable = 0;
int dtrace_helptrace_bufsize = 16 * 1024 * 1024;
uint32_t dtrace_helptrace_nlocals;
static dtrace_helptrace_t *dtrace_helptrace_buffer;
static uint32_t dtrace_helptrace_next = 0;
static int dtrace_helptrace_wrapped = 0;
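/*
 * For example (assuming a live-system debugging workflow), helper tracing
 * could be enabled by writing the enable variable with mdb -kw:
 *
 * echo 'dtrace_helptrace_enable/W 1' | mdb -kw
 *
 * after which buffers are allocated on the next open of /dev/dtrace;
 * setting dtrace_helptrace_disable the same way deallocates them on the
 * next close.
 */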
/*
* DTrace Error Hashing
*
* On DEBUG kernels, DTrace will track the errors that it has seen in a hash
* table. This is very useful for checking coverage of tests that are
* expected to induce DIF or DOF processing errors, and may be useful for
* debugging problems in the DIF code generator or in DOF generation. The
* error hash may be examined with the ::dtrace_errhash MDB dcmd.
*/
#ifdef DEBUG
static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif
/*
* DTrace Macros and Constants
*
* These are various macros that are useful in various spots in the
* implementation, along with a few random constants that have no meaning
* outside of the implementation. There is no real structure to this cpp
* mishmash -- but is there ever?
*/
#define DTRACE_HASHSTR(hash, probe) \
dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
#define DTRACE_HASHNEXT(hash, probe) \
(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
#define DTRACE_HASHPREV(hash, probe) \
(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
#define DTRACE_HASHEQ(hash, lhs, rhs) \
(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
*((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
#define DTRACE_AGGHASHSIZE_SLEW 17
#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
/*
* The key for a thread-local variable consists of the lower 61 bits of the
* t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
* We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
* equal to a variable identifier. This is necessary (but not sufficient) to
* assure that global associative arrays never collide with thread-local
* variables. To guarantee that they cannot collide, we must also define the
* order for keying dynamic variables. That order is:
*
* [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
*
* Because the variable-key and the tls-key are in orthogonal spaces, there is
* no way for a global variable key signature to match a thread-local key
* signature.
*/
#define DTRACE_TLS_THRKEY(where) { \
uint_t intr = 0; \
uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
for (; actv; actv >>= 1) \
intr++; \
ASSERT(intr < (1 << 3)); \
(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
(((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#define DT_BSWAP_8(x) ((x) & 0xff)
#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
#define DT_MASK_LO 0x00000000FFFFFFFFULL
#define DTRACE_STORE(type, tomax, offset, what) \
*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
#ifndef __x86
#define DTRACE_ALIGNCHECK(addr, size, flags) \
if (addr & (size - 1)) { \
*flags |= CPU_DTRACE_BADALIGN; \
cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
return (0); \
}
#else
#define DTRACE_ALIGNCHECK(addr, size, flags)
#endif
/*
* Test whether a range of memory starting at testaddr of size testsz falls
* within the range of memory described by addr, sz. We take care to avoid
* problems with overflow and underflow of the unsigned quantities, and
* disallow all negative sizes. Ranges of size 0 are allowed.
*/
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
(testaddr) + (testsz) >= (testaddr))
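/*
 * As a worked example of why the final clause is needed: on a 64-bit
 * kernel, with baseaddr = 0x1000 and basesz = 0x100, a malicious request
 * of testaddr = 0x10f0 and testsz = 0xffffffffffffffff wraps on the
 * addition and passes the first two clauses; only the
 * ((testaddr) + (testsz) >= (testaddr)) check rejects it. (Illustrative
 * values only.)
 */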
#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \
do { \
if ((remp) != NULL) { \
*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \
} \
_NOTE(CONSTCOND) } while (0)
/*
* Test whether alloc_sz bytes will fit in the scratch region. We isolate
* alloc_sz on the righthand side of the comparison in order to avoid overflow
* or underflow in the comparison with it. This is simpler than the INRANGE
* check above, because we know that the dtms_scratch_ptr is valid in the
* range. Allocations of size zero are allowed.
*/
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
(mstate)->dtms_scratch_ptr >= (alloc_sz))
#define DTRACE_LOADFUNC(bits) \
/*CSTYLED*/ \
uint##bits##_t \
dtrace_load##bits(uintptr_t addr) \
{ \
size_t size = bits / NBBY; \
/*CSTYLED*/ \
uint##bits##_t rval; \
int i; \
volatile uint16_t *flags = (volatile uint16_t *) \
&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \
\
DTRACE_ALIGNCHECK(addr, size, flags); \
\
for (i = 0; i < dtrace_toxranges; i++) { \
if (addr >= dtrace_toxrange[i].dtt_limit) \
continue; \
\
if (addr + size <= dtrace_toxrange[i].dtt_base) \
continue; \
\
/* \
* This address falls within a toxic region; return 0. \
*/ \
*flags |= CPU_DTRACE_BADADDR; \
cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
return (0); \
} \
\
*flags |= CPU_DTRACE_NOFAULT; \
/*CSTYLED*/ \
rval = *((volatile uint##bits##_t *)addr); \
*flags &= ~CPU_DTRACE_NOFAULT; \
\
return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \
}
#ifdef _LP64
#define dtrace_loadptr dtrace_load64
#else
#define dtrace_loadptr dtrace_load32
#endif
#define DTRACE_DYNHASH_FREE 0
#define DTRACE_DYNHASH_SINK 1
#define DTRACE_DYNHASH_VALID 2
#define DTRACE_MATCH_FAIL -1
#define DTRACE_MATCH_NEXT 0
#define DTRACE_MATCH_DONE 1
#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN 64
#define DTRACE_FLAGS2FLT(flags) \
(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
DTRACEFLT_UNKNOWN)
#define DTRACEACT_ISSTRING(act) \
((act)->dta_kind == DTRACEACT_DIFEXPR && \
(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static void dtrace_enabling_reap(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *);
static void dtrace_getf_barrier(void);
static int dtrace_canload_remains(uint64_t, size_t, size_t *,
dtrace_mstate_t *, dtrace_vstate_t *);
static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
dtrace_mstate_t *, dtrace_vstate_t *);
/*
* DTrace Probe Context Functions
*
* These functions are called from probe context. Because probe context is
* any context in which C may be called, arbitrary locks may be held,
* interrupts may be disabled, we may be in arbitrary dispatched state, etc.
* As a result, functions called from probe context may only call other DTrace
* support functions -- they may not interact at all with the system at large.
* (Note that the ASSERT macro is made probe-context safe by redefining it in
* terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
* loads are to be performed from probe context, they _must_ be in terms of
* the safe dtrace_load*() variants.
*
* Some functions in this block are not actually called from probe context;
* for these functions, there will be a comment above the function reading
* "Note: not called from probe context."
*/
void
dtrace_panic(const char *format, ...)
{
va_list alist;
va_start(alist, format);
dtrace_vpanic(format, alist);
va_end(alist);
}
int
dtrace_assfail(const char *a, const char *f, int l)
{
dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
/*
* We just need something here that even the most clever compiler
* cannot optimize away.
*/
return (a[(uintptr_t)f]);
}
/*
* Atomically increment a specified error counter from probe context.
*/
static void
dtrace_error(uint32_t *counter)
{
/*
* Most counters stored to in probe context are per-CPU counters.
* However, there are some error conditions that are sufficiently
* arcane that they don't merit per-CPU storage. If these counters
* are incremented concurrently on different CPUs, scalability will be
* adversely affected -- but we don't expect them to be white-hot in a
* correctly constructed enabling...
*/
uint32_t oval, nval;
do {
oval = *counter;
if ((nval = oval + 1) == 0) {
/*
* If the counter would wrap, set it to 1 -- assuring
* that the counter is never zero when we have seen
* errors. (The counter must be 32-bits because we
* aren't guaranteed a 64-bit compare&swap operation.)
* To save this code both the infamy of being fingered
* by a priggish news story and the indignity of being
* the target of a neo-puritan witch trial, we're
* carefully avoiding any colorful description of the
* likelihood of this condition -- but suffice it to
* say that it is only slightly more likely than the
* overflow of predicate cache IDs, as discussed in
* dtrace_predicate_create().
*/
nval = 1;
}
} while (dtrace_cas32(counter, oval, nval) != oval);
}
/*
* Use the DTRACE_LOADFUNC macro to define functions for each of loading a
* uint8_t, a uint16_t, a uint32_t and a uint64_t.
*/
/* BEGIN CSTYLED */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
/* END CSTYLED */
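/*
 * An illustrative (non-live) sketch of how probe context consumes these
 * functions: an arbitrary address is read only via dtrace_load*(), and the
 * per-CPU fault flag is consulted afterward rather than trusting the
 * loaded value blindly. (The variable addr here is just a stand-in for
 * some untrusted address.)
 */
#if 0
volatile uint16_t *flags = (volatile uint16_t *)
    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
uint64_t val = dtrace_load64(addr);

if (*flags & CPU_DTRACE_FAULT) {
	/* the load faulted or hit a toxic range; val is 0 */
}
#endif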
static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
if (dest < mstate->dtms_scratch_base)
return (0);
if (dest + size < dest)
return (0);
if (dest + size > mstate->dtms_scratch_ptr)
return (0);
return (1);
}
static int
dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
dtrace_statvar_t **svars, int nsvars)
{
int i;
size_t maxglobalsize, maxlocalsize;
if (nsvars == 0)
return (0);
maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
maxlocalsize = maxglobalsize * NCPU;
for (i = 0; i < nsvars; i++) {
dtrace_statvar_t *svar = svars[i];
uint8_t scope;
size_t size;
if (svar == NULL || (size = svar->dtsv_size) == 0)
continue;
scope = svar->dtsv_var.dtdv_scope;
/*
* We verify that our size is valid in the spirit of providing
* defense in depth: we want to prevent attackers from using
* DTrace to escalate an orthogonal kernel heap corruption bug
* into the ability to store to arbitrary locations in memory.
*/
VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
(scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,
svar->dtsv_size)) {
DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
svar->dtsv_size);
return (1);
}
}
return (0);
}
/*
* Check to see if the address is within a memory region to which a store may
* be issued. This includes the DTrace scratch areas, and any DTrace variable
* region. The caller of dtrace_canstore() is responsible for performing any
* alignment checks that are needed before stores are actually executed.
*/
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
}
/*
* Implementation of dtrace_canstore which communicates the upper bound of the
* allowed memory region.
*/
static int
dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
/*
* First, check to see if the address is in scratch space...
*/
if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
mstate->dtms_scratch_size)) {
DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
mstate->dtms_scratch_size);
return (1);
}
/*
* Now check to see if it's a dynamic variable. This check will pick
* up both thread-local variables and any global dynamically-allocated
* variables.
*/
if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
vstate->dtvs_dynvars.dtds_size)) {
dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
uintptr_t base = (uintptr_t)dstate->dtds_base +
(dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
uintptr_t chunkoffs;
dtrace_dynvar_t *dvar;
/*
* Before we assume that we can store here, we need to make
* sure that it isn't in our metadata -- storing to our
* dynamic variable metadata would corrupt our state. For
* the range to not include any dynamic variable metadata,
* it must:
*
* (1) Start above the hash table that is at the base of
* the dynamic variable space
*
* (2) Have a starting chunk offset that is beyond the
* dtrace_dynvar_t that is at the base of every chunk
*
* (3) Not span a chunk boundary
*
* (4) Not be in the tuple space of a dynamic variable
*
*/
if (addr < base)
return (0);
chunkoffs = (addr - base) % dstate->dtds_chunksize;
if (chunkoffs < sizeof (dtrace_dynvar_t))
return (0);
if (chunkoffs + sz > dstate->dtds_chunksize)
return (0);
dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
return (0);
if (chunkoffs < sizeof (dtrace_dynvar_t) +
((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
return (0);
DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);
return (1);
}
/*
* Finally, check the static local and global variables. These checks
* take the longest, so we perform them last.
*/
if (dtrace_canstore_statvar(addr, sz, remain,
vstate->dtvs_locals, vstate->dtvs_nlocals))
return (1);
if (dtrace_canstore_statvar(addr, sz, remain,
vstate->dtvs_globals, vstate->dtvs_nglobals))
return (1);
return (0);
}
/*
* Convenience routine to check to see if the address is within a memory
* region in which a load may be issued given the user's privilege level;
* if not, it sets the appropriate error flags and loads 'addr' into the
* illegal value slot.
*
* DTrace subroutines (DIF_SUBR_*) should use this helper to implement
* appropriate memory access protection.
*/
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
}
/*
* Implementation of dtrace_canload which communicates the upper bound of the
* allowed memory region.
*/
static int
dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
file_t *fp;
/*
* If we hold the privilege to read from kernel memory, then
* everything is readable.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
return (1);
}
/*
* You can obviously read that which you can store.
*/
if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
return (1);
/*
* We're allowed to read from our own string table.
*/
if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
mstate->dtms_difo->dtdo_strlen)) {
DTRACE_RANGE_REMAIN(remain, addr,
mstate->dtms_difo->dtdo_strtab,
mstate->dtms_difo->dtdo_strlen);
return (1);
}
if (vstate->dtvs_state != NULL &&
dtrace_priv_proc(vstate->dtvs_state, mstate)) {
proc_t *p;
/*
* When we have privileges to the current process, there are
* several context-related kernel structures that are safe to
* read, even absent the privilege to read from kernel memory.
* These reads are safe because these structures contain only
* state that (1) we're permitted to read, (2) is harmless or
* (3) contains pointers to additional kernel state that we're
* not permitted to read (and as such, do not present an
* opportunity for privilege escalation). Finally (and
* critically), because of the nature of their relation with
* the current thread context, the memory associated with these
* structures cannot change over the duration of probe context,
* and it is therefore impossible for this memory to be
* deallocated and reallocated as something else while it's
* being operated upon.
*/
if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
DTRACE_RANGE_REMAIN(remain, addr, curthread,
sizeof (kthread_t));
return (1);
}
if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
sz, curthread->t_procp, sizeof (proc_t))) {
DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
sizeof (proc_t));
return (1);
}
if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
curthread->t_cred, sizeof (cred_t))) {
DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
sizeof (cred_t));
return (1);
}
if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
&(p->p_pidp->pid_id), sizeof (pid_t))) {
DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
sizeof (pid_t));
return (1);
}
if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
offsetof(cpu_t, cpu_pause_thread));
return (1);
}
}
if ((fp = mstate->dtms_getf) != NULL) {
uintptr_t psz = sizeof (void *);
vnode_t *vp;
vnodeops_t *op;
/*
* When getf() returns a file_t, the enabling is implicitly
* granted the (transient) right to read the returned file_t
* as well as the v_path and v_op->vnop_name of the underlying
* vnode. These accesses are allowed after a successful
* getf() because the members that they refer to cannot change
* once set -- and the barrier logic in the kernel's closef()
* path assures that the file_t and its referenced vnode_t
* cannot themselves be stale (that is, it is impossible for
* either dtms_getf itself or its f_vnode member to reference
* freed memory).
*/
if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
return (1);
}
if ((vp = fp->f_vnode) != NULL) {
size_t slen;
if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
psz);
return (1);
}
slen = strlen(vp->v_path) + 1;
if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
slen);
return (1);
}
if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
psz);
return (1);
}
if ((op = vp->v_op) != NULL &&
DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
DTRACE_RANGE_REMAIN(remain, addr,
&op->vnop_name, psz);
return (1);
}
if (op != NULL && op->vnop_name != NULL &&
DTRACE_INRANGE(addr, sz, op->vnop_name,
(slen = strlen(op->vnop_name) + 1))) {
DTRACE_RANGE_REMAIN(remain, addr,
op->vnop_name, slen);
return (1);
}
}
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
*illval = addr;
return (0);
}
/*
* Convenience routine to check to see if a given string is within a memory
* region in which a load may be issued given the user's privilege level;
* this exists so that we don't need to issue unnecessary dtrace_strlen()
* calls in the event that the user has all privileges.
*/
static int
dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
size_t rsize;
/*
* If we hold the privilege to read from kernel memory, then
* everything is readable.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
return (1);
}
/*
* Even if the caller is uninterested in querying the remaining valid
* range, that range must still be computed to ensure that the access is
* allowed.
*/
if (remain == NULL) {
remain = &rsize;
}
if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
size_t strsz;
/*
* Perform the strlen after determining the length of the
* memory region which is accessible. This prevents timing
* information from being used to find NULs in memory which is
* not accessible to the caller.
*/
strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
MIN(sz, *remain));
if (strsz <= *remain) {
return (1);
}
}
return (0);
}
/*
* Convenience routine to check to see if a given variable is within a memory
* region in which a load may be issued given the user's privilege level.
*/
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
size_t sz;
ASSERT(type->dtdt_flags & DIF_TF_BYREF);
/*
* Calculate the max size before performing any checks since even
* DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
* return the max length via 'remain'.
*/
if (type->dtdt_kind == DIF_TYPE_STRING) {
dtrace_state_t *state = vstate->dtvs_state;
if (state != NULL) {
sz = state->dts_options[DTRACEOPT_STRSIZE];
} else {
/*
* In helper context, we have a NULL state; fall back
* to using the system-wide default for the string size
* in this case.
*/
sz = dtrace_strsize_default;
}
} else {
sz = type->dtdt_size;
}
/*
* If we hold the privilege to read from kernel memory, then
* everything is readable.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
return (1);
}
if (type->dtdt_kind == DIF_TYPE_STRING) {
return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
vstate));
}
return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
vstate));
}
/*
* Convert a string to a signed integer using safe loads.
*
* NOTE: This function uses various macros from strtolctype.h to manipulate
* digit values, etc -- these have all been checked to ensure they make
* no additional function calls.
*/
static int64_t
dtrace_strtoll(char *input, int base, size_t limit)
{
uintptr_t pos = (uintptr_t)input;
int64_t val = 0;
int x;
boolean_t neg = B_FALSE;
char c, cc, ccc;
uintptr_t end = pos + limit;
/*
* Consume any whitespace preceding digits.
*/
while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
pos++;
/*
* Handle an explicit sign if one is present.
*/
if (c == '-' || c == '+') {
if (c == '-')
neg = B_TRUE;
c = dtrace_load8(++pos);
}
/*
* Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
* if present.
*/
if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
pos += 2;
c = ccc;
}
/*
* Read in contiguous digits until the first non-digit character.
*/
for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
c = dtrace_load8(++pos))
val = val * base + x;
return (neg ? -val : val);
}
/*
* Compare two strings using safe loads.
*/
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
uint8_t c1, c2;
volatile uint16_t *flags;
if (s1 == s2 || limit == 0)
return (0);
flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
do {
if (s1 == NULL) {
c1 = '\0';
} else {
c1 = dtrace_load8((uintptr_t)s1++);
}
if (s2 == NULL) {
c2 = '\0';
} else {
c2 = dtrace_load8((uintptr_t)s2++);
}
if (c1 != c2)
return (c1 - c2);
} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
return (0);
}
/*
* Compute strlen(s) for a string using safe memory accesses. The additional
* lim parameter is used to specify a maximum length to ensure completion.
*/
static size_t
dtrace_strlen(const char *s, size_t lim)
{
uint_t len;
for (len = 0; len != lim; len++) {
if (dtrace_load8((uintptr_t)s++) == '\0')
break;
}
return (len);
}
/*
* Check if an address falls within a toxic region.
*/
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
uintptr_t taddr, tsize;
int i;
for (i = 0; i < dtrace_toxranges; i++) {
taddr = dtrace_toxrange[i].dtt_base;
tsize = dtrace_toxrange[i].dtt_limit - taddr;
if (kaddr - taddr < tsize) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
return (1);
}
if (taddr - kaddr < size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
return (1);
}
}
return (0);
}
/*
* Copy src to dst using safe memory accesses. The src is assumed to be unsafe
* memory specified by the DIF program. The dst is assumed to be safe memory
* that we can store to directly because it is managed by DTrace. As with
* standard bcopy, overlapping copies are handled properly.
*/
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
if (len != 0) {
uint8_t *s1 = dst;
const uint8_t *s2 = src;
if (s1 <= s2) {
do {
*s1++ = dtrace_load8((uintptr_t)s2++);
} while (--len != 0);
} else {
s2 += len;
s1 += len;
do {
*--s1 = dtrace_load8((uintptr_t)--s2);
} while (--len != 0);
}
}
}
/*
* Copy src to dst using safe memory accesses, up to either the specified
* length, or the point that a nul byte is encountered. The src is assumed to
* be unsafe memory specified by the DIF program. The dst is assumed to be
* safe memory that we can store to directly because it is managed by DTrace.
* Unlike dtrace_bcopy(), overlapping regions are not handled.
*/
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
if (len != 0) {
uint8_t *s1 = dst, c;
const uint8_t *s2 = src;
do {
*s1++ = c = dtrace_load8((uintptr_t)s2++);
} while (--len != 0 && c != '\0');
}
}
/*
* Copy src to dst, deriving the size and type from the specified (BYREF)
* variable type. The src is assumed to be unsafe memory specified by the DIF
* program. The dst is assumed to be DTrace variable memory that is of the
* specified type; we assume that we can store to it directly.
*/
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
{
ASSERT(type->dtdt_flags & DIF_TF_BYREF);
if (type->dtdt_kind == DIF_TYPE_STRING) {
dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
} else {
dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
}
}
/*
* Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
* unsafe memory specified by the DIF program. The s2 data is assumed to be
* safe memory that we can access directly because it is managed by DTrace.
*/
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
volatile uint16_t *flags;
flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
if (s1 == s2)
return (0);
if (s1 == NULL || s2 == NULL)
return (1);
if (s1 != s2 && len != 0) {
const uint8_t *ps1 = s1;
const uint8_t *ps2 = s2;
do {
if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
return (1);
} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
}
return (0);
}
/*
* Zero the specified region using a simple byte-by-byte loop. Note that this
* is for safe DTrace-managed memory only.
*/
static void
dtrace_bzero(void *dst, size_t len)
{
uchar_t *cp;
for (cp = dst; len != 0; len--)
*cp++ = 0;
}
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
uint64_t result[2];
result[0] = addend1[0] + addend2[0];
result[1] = addend1[1] + addend2[1] +
(result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
sum[0] = result[0];
sum[1] = result[1];
}
/*
* Shift the 128-bit value in a by b. If b is positive, shift left.
* If b is negative, shift right.
*/
static void
dtrace_shift_128(uint64_t *a, int b)
{
uint64_t mask;
if (b == 0)
return;
if (b < 0) {
b = -b;
if (b >= 64) {
a[0] = a[1] >> (b - 64);
a[1] = 0;
} else {
a[0] >>= b;
mask = 1LL << (64 - b);
mask -= 1;
a[0] |= ((a[1] & mask) << (64 - b));
a[1] >>= b;
}
} else {
if (b >= 64) {
a[1] = a[0] << (b - 64);
a[0] = 0;
} else {
a[1] <<= b;
mask = a[0] >> (64 - b);
a[1] |= mask;
a[0] <<= b;
}
}
}
/*
* The basic idea is to break the 2 64-bit values into 4 32-bit values,
* use native multiplication on those, and then re-combine into the
* resulting 128-bit value.
*
* (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
* hi1 * hi2 << 64 +
* hi1 * lo2 << 32 +
* hi2 * lo1 << 32 +
* lo1 * lo2
*/
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
uint64_t hi1, hi2, lo1, lo2;
uint64_t tmp[2];
hi1 = factor1 >> 32;
hi2 = factor2 >> 32;
lo1 = factor1 & DT_MASK_LO;
lo2 = factor2 & DT_MASK_LO;
product[0] = lo1 * lo2;
product[1] = hi1 * hi2;
tmp[0] = hi1 * lo2;
tmp[1] = 0;
dtrace_shift_128(tmp, 32);
dtrace_add_128(product, tmp, product);
tmp[0] = hi2 * lo1;
tmp[1] = 0;
dtrace_shift_128(tmp, 32);
dtrace_add_128(product, tmp, product);
}
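/*
 * As a quick sanity check of the decomposition above: for factor1 =
 * factor2 = 0x100000001 (so hi1 = hi2 = lo1 = lo2 = 1), the expansion
 * yields (1 << 64) + (1 << 32) + (1 << 32) + 1, i.e. product[1] = 0x1 and
 * product[0] = 0x200000001, which matches the true 128-bit square.
 */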
/*
* This privilege check should be used by actions and subroutines to
* verify that the user credentials of the process that enabled the
* invoking ECB match the target credentials
*/
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
/*
* We should always have a non-NULL state cred here, since if cred
* is null (anonymous tracing), we fast-path bypass this routine.
*/
ASSERT(s_cr != NULL);
if ((cr = CRED()) != NULL &&
s_cr->cr_uid == cr->cr_uid &&
s_cr->cr_uid == cr->cr_ruid &&
s_cr->cr_uid == cr->cr_suid &&
s_cr->cr_gid == cr->cr_gid &&
s_cr->cr_gid == cr->cr_rgid &&
s_cr->cr_gid == cr->cr_sgid)
return (1);
return (0);
}
/*
* This privilege check should be used by actions and subroutines to
* verify that the zone of the process that enabled the invoking ECB
* matches the target credentials
*/
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
/*
* We should always have a non-NULL state cred here, since if cred
* is null (anonymous tracing), we fast-path bypass this routine.
*/
ASSERT(s_cr != NULL);
if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
return (1);
return (0);
}
/*
* This privilege check should be used by actions and subroutines to
* verify that the process has not setuid or changed credentials.
*/
static int
dtrace_priv_proc_common_nocd()
{
proc_t *proc;
if ((proc = ttoproc(curthread)) != NULL &&
!(proc->p_flag & SNOCD))
return (1);
return (0);
}
static int
dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate)
{
int action = state->dts_cred.dcr_action;
if (!(mstate->dtms_access & DTRACE_ACCESS_PROC))
goto bad;
if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
dtrace_priv_proc_common_zone(state) == 0)
goto bad;
if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
dtrace_priv_proc_common_user(state) == 0)
goto bad;
if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
dtrace_priv_proc_common_nocd() == 0)
goto bad;
return (1);
bad:
cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
return (0);
}
static int
dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate)
{
if (mstate->dtms_access & DTRACE_ACCESS_PROC) {
if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
return (1);
if (dtrace_priv_proc_common_zone(state) &&
dtrace_priv_proc_common_user(state) &&
dtrace_priv_proc_common_nocd())
return (1);
}
cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
return (0);
}
static int
dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate)
{
if ((mstate->dtms_access & DTRACE_ACCESS_PROC) &&
(state->dts_cred.dcr_action & DTRACE_CRA_PROC))
return (1);
cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
return (0);
}
static int
dtrace_priv_kernel(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
return (1);
cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
return (0);
}
static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
return (1);
cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
return (0);
}
/*
* Determine if the dte_cond of the specified ECB allows for processing of
* the current probe to continue. Note that this routine may allow continued
* processing, but with access(es) stripped from the mstate's dtms_access
* field.
*/
static int
dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
dtrace_ecb_t *ecb)
{
dtrace_probe_t *probe = ecb->dte_probe;
dtrace_provider_t *prov = probe->dtpr_provider;
dtrace_pops_t *pops = &prov->dtpv_pops;
int mode = DTRACE_MODE_NOPRIV_DROP;
ASSERT(ecb->dte_cond);
if (pops->dtps_mode != NULL) {
mode = pops->dtps_mode(prov->dtpv_arg,
probe->dtpr_id, probe->dtpr_arg);
ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL));
ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT |
DTRACE_MODE_NOPRIV_DROP));
}
/*
* If the dte_cond bits indicate that this consumer is only allowed to
* see user-mode firings of this probe, check that the probe was fired
* while in a user context. If that's not the case, use the policy
* specified by the provider to determine if we drop the probe or
* merely restrict operation.
*/
if (ecb->dte_cond & DTRACE_COND_USERMODE) {
ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
if (!(mode & DTRACE_MODE_USER)) {
if (mode & DTRACE_MODE_NOPRIV_DROP)
return (0);
mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
}
}
/*
* This is more subtle than it looks. We have to be absolutely certain
* that CRED() isn't going to change out from under us so it's only
* legit to examine that structure if we're in constrained situations.
* Currently, the only time we'll make this check is if a non-super-user
* has enabled the profile or syscall providers -- providers that
* allow visibility of all processes. For the profile case, the check
* above will ensure that we're examining a user context.
*/
if (ecb->dte_cond & DTRACE_COND_OWNER) {
cred_t *cr;
cred_t *s_cr = state->dts_cred.dcr_cred;
proc_t *proc;
ASSERT(s_cr != NULL);
if ((cr = CRED()) == NULL ||
s_cr->cr_uid != cr->cr_uid ||
s_cr->cr_uid != cr->cr_ruid ||
s_cr->cr_uid != cr->cr_suid ||
s_cr->cr_gid != cr->cr_gid ||
s_cr->cr_gid != cr->cr_rgid ||
s_cr->cr_gid != cr->cr_sgid ||
(proc = ttoproc(curthread)) == NULL ||
(proc->p_flag & SNOCD)) {
if (mode & DTRACE_MODE_NOPRIV_DROP)
return (0);
mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
}
}
/*
* If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
* in our zone, check to see if our mode policy is to restrict rather
* than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
* and DTRACE_ACCESS_ARGS
*/
if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
cred_t *cr;
cred_t *s_cr = state->dts_cred.dcr_cred;
ASSERT(s_cr != NULL);
if ((cr = CRED()) == NULL ||
s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
if (mode & DTRACE_MODE_NOPRIV_DROP)
return (0);
mstate->dtms_access &=
~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
}
}
/*
* By merits of being in this code path at all, we have limited
* privileges. If the provider has indicated that limited privileges
* are to denote restricted operation, strip off the ability to access
* arguments.
*/
if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT)
mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
return (1);
}
/*
* Note: not called from probe context. This function is called
* asynchronously (and at a regular interval) from outside of probe context to
* clean the dirty dynamic variable lists on all CPUs. Dynamic variable
* cleaning is explained in detail in <sys/dtrace_impl.h>.
*/
void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
dtrace_dynvar_t *dirty;
dtrace_dstate_percpu_t *dcpu;
dtrace_dynvar_t **rinsep;
int i, j, work = 0;
for (i = 0; i < NCPU; i++) {
dcpu = &dstate->dtds_percpu[i];
rinsep = &dcpu->dtdsc_rinsing;
/*
* If the dirty list is NULL, there is no dirty work to do.
*/
if (dcpu->dtdsc_dirty == NULL)
continue;
if (dcpu->dtdsc_rinsing != NULL) {
/*
* If the rinsing list is non-NULL, then it is because
* this CPU was selected to accept another CPU's
* dirty list -- and since that time, dirty buffers
* have accumulated. This is a highly unlikely
* condition, but we choose to ignore the dirty
* buffers -- they'll be picked up by a future cleanse.
*/
continue;
}
if (dcpu->dtdsc_clean != NULL) {
/*
* If the clean list is non-NULL, then we're in a
* situation where a CPU has done deallocations (we
* have a non-NULL dirty list) but no allocations (we
* also have a non-NULL clean list). We can't simply
* move the dirty list into the clean list on this
* CPU, yet we also don't want to allow this condition
* to persist, lest a short clean list prevent a
* massive dirty list from being cleaned (which in
* turn could lead to otherwise avoidable dynamic
* drops). To deal with this, we look for some CPU
* with a NULL clean list, NULL dirty list, and NULL
* rinsing list -- and then we borrow this CPU to
* rinse our dirty list.
*/
for (j = 0; j < NCPU; j++) {
dtrace_dstate_percpu_t *rinser;
rinser = &dstate->dtds_percpu[j];
if (rinser->dtdsc_rinsing != NULL)
continue;
if (rinser->dtdsc_dirty != NULL)
continue;
if (rinser->dtdsc_clean != NULL)
continue;
rinsep = &rinser->dtdsc_rinsing;
break;
}
if (j == NCPU) {
/*
* We were unable to find another CPU that
* could accept this dirty list -- we are
* therefore unable to clean it now.
*/
dtrace_dynvar_failclean++;
continue;
}
}
work = 1;
/*
* Atomically move the dirty list aside.
*/
do {
dirty = dcpu->dtdsc_dirty;
/*
* Before we zap the dirty list, set the rinsing list.
* (This allows for a potential assertion in
* dtrace_dynvar(): if a free dynamic variable appears
* on a hash chain, either the dirty list or the
* rinsing list for some CPU must be non-NULL.)
*/
*rinsep = dirty;
dtrace_membar_producer();
} while (dtrace_casptr(&dcpu->dtdsc_dirty,
dirty, NULL) != dirty);
}
if (!work) {
/*
* We have no work to do; we can simply return.
*/
return;
}
dtrace_sync();
for (i = 0; i < NCPU; i++) {
dcpu = &dstate->dtds_percpu[i];
if (dcpu->dtdsc_rinsing == NULL)
continue;
/*
* We are now guaranteed that no hash chain contains a pointer
* into this dirty list; we can make it clean.
*/
ASSERT(dcpu->dtdsc_clean == NULL);
dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
dcpu->dtdsc_rinsing = NULL;
}
/*
* Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
* sure that all CPUs have seen all of the dtdsc_clean pointers.
* This prevents a race whereby a CPU incorrectly decides that
* the state should be something other than DTRACE_DSTATE_CLEAN
* after dtrace_dynvar_clean() has completed.
*/
dtrace_sync();
dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}
/*
* Depending on the value of the op parameter, this function looks up,
* allocates or deallocates an arbitrarily-keyed dynamic variable. If an
* allocation is requested, this function will return a pointer to a
* dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
* variable can be allocated. If NULL is returned, the appropriate counter
* will be incremented.
*/
dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
uint64_t hashval = DTRACE_DYNHASH_VALID;
dtrace_dynhash_t *hash = dstate->dtds_hash;
dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
processorid_t me = CPU->cpu_id, cpu = me;
dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
size_t bucket, ksize;
size_t chunksize = dstate->dtds_chunksize;
uintptr_t kdata, lock, nstate;
uint_t i;
ASSERT(nkeys != 0);
/*
* Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
* algorithm. For the by-value portions, we perform the algorithm in
* 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
* bit, and seems to have only a minute effect on distribution. For
* the by-reference data, we perform "One-at-a-time" iterating (safely)
* over each referenced byte. It's painful to do this, but it's much
* better than pathological hash distribution. The efficacy of the
* hashing algorithm (and a comparison with other algorithms) may be
* found by running the ::dtrace_dynstat MDB dcmd.
*/
for (i = 0; i < nkeys; i++) {
if (key[i].dttk_size == 0) {
uint64_t val = key[i].dttk_value;
hashval += (val >> 48) & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
hashval += (val >> 32) & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
hashval += (val >> 16) & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
hashval += val & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
} else {
/*
* This is incredibly painful, but it beats the hell
* out of the alternative.
*/
uint64_t j, size = key[i].dttk_size;
uintptr_t base = (uintptr_t)key[i].dttk_value;
if (!dtrace_canload(base, size, mstate, vstate))
break;
for (j = 0; j < size; j++) {
hashval += dtrace_load8(base + j);
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
}
}
}
if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
return (NULL);
hashval += (hashval << 3);
hashval ^= (hashval >> 11);
hashval += (hashval << 15);
/*
* There is a remote chance (ideally, 1 in 2^31) that our hashval
* comes out to be one of our two sentinel hash values. If this
* actually happens, we set the hashval to be a value known to be a
* non-sentinel value.
*/
if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
hashval = DTRACE_DYNHASH_VALID;
/*
* Yes, it's painful to do a divide here. If the cycle count becomes
* important here, tricks can be pulled to reduce it. (However, it's
* critical that hash collisions be kept to an absolute minimum;
* they're much more painful than a divide.) It's better to have a
* solution that generates few collisions and still keeps things
* relatively simple.
*/
bucket = hashval % dstate->dtds_hashsize;
if (op == DTRACE_DYNVAR_DEALLOC) {
volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
for (;;) {
while ((lock = *lockp) & 1)
continue;
if (dtrace_casptr((void *)lockp,
(void *)lock, (void *)(lock + 1)) == (void *)lock)
break;
}
dtrace_membar_producer();
}
top:
prev = NULL;
lock = hash[bucket].dtdh_lock;
dtrace_membar_consumer();
start = hash[bucket].dtdh_chain;
ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
op != DTRACE_DYNVAR_DEALLOC));
for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
dtrace_key_t *dkey = &dtuple->dtt_key[0];
if (dvar->dtdv_hashval != hashval) {
if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
/*
* We've reached the sink, and therefore the
* end of the hash chain; we can kick out of
* the loop knowing that we have seen a valid
* snapshot of state.
*/
ASSERT(dvar->dtdv_next == NULL);
ASSERT(dvar == &dtrace_dynhash_sink);
break;
}
if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
/*
* We've gone off the rails: somewhere along
* the line, one of the members of this hash
* chain was deleted. Note that we could also
* detect this by simply letting this loop run
* to completion, as we would eventually hit
* the end of the dirty list. However, we
* want to avoid running the length of the
* dirty list unnecessarily (it might be quite
* long), so we catch this as early as
* possible by detecting the hash marker. In
* this case, we simply set dvar to NULL and
* break; the conditional after the loop will
* send us back to top.
*/
dvar = NULL;
break;
}
goto next;
}
if (dtuple->dtt_nkeys != nkeys)
goto next;
for (i = 0; i < nkeys; i++, dkey++) {
if (dkey->dttk_size != key[i].dttk_size)
goto next; /* size or type mismatch */
if (dkey->dttk_size != 0) {
if (dtrace_bcmp(
(void *)(uintptr_t)key[i].dttk_value,
(void *)(uintptr_t)dkey->dttk_value,
dkey->dttk_size))
goto next;
} else {
if (dkey->dttk_value != key[i].dttk_value)
goto next;
}
}
if (op != DTRACE_DYNVAR_DEALLOC)
return (dvar);
ASSERT(dvar->dtdv_next == NULL ||
dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
if (prev != NULL) {
ASSERT(hash[bucket].dtdh_chain != dvar);
ASSERT(start != dvar);
ASSERT(prev->dtdv_next == dvar);
prev->dtdv_next = dvar->dtdv_next;
} else {
if (dtrace_casptr(&hash[bucket].dtdh_chain,
start, dvar->dtdv_next) != start) {
/*
* We have failed to atomically swing the
* hash table head pointer, presumably because
* of a conflicting allocation on another CPU.
* We need to reread the hash chain and try
* again.
*/
goto top;
}
}
dtrace_membar_producer();
/*
* Now set the hash value to indicate that it's free.
*/
ASSERT(hash[bucket].dtdh_chain != dvar);
dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
dtrace_membar_producer();
/*
* Set the next pointer to point at the dirty list, and
* atomically swing the dirty pointer to the newly freed dvar.
*/
do {
next = dcpu->dtdsc_dirty;
dvar->dtdv_next = next;
} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
/*
* Finally, unlock this hash bucket.
*/
ASSERT(hash[bucket].dtdh_lock == lock);
ASSERT(lock & 1);
hash[bucket].dtdh_lock++;
return (NULL);
next:
prev = dvar;
continue;
}
if (dvar == NULL) {
/*
* If dvar is NULL, it is because we went off the rails:
* one of the elements that we traversed in the hash chain
* was deleted while we were traversing it. In this case,
* we assert that we aren't doing a dealloc (deallocs lock
* the hash bucket to prevent themselves from racing with
* one another), and retry the hash chain traversal.
*/
ASSERT(op != DTRACE_DYNVAR_DEALLOC);
goto top;
}
if (op != DTRACE_DYNVAR_ALLOC) {
/*
* If we are not to allocate a new variable, we want to
* return NULL now. Before we return, check that the value
* of the lock word hasn't changed. If it has, we may have
* seen an inconsistent snapshot.
*/
if (op == DTRACE_DYNVAR_NOALLOC) {
if (hash[bucket].dtdh_lock != lock)
goto top;
} else {
ASSERT(op == DTRACE_DYNVAR_DEALLOC);
ASSERT(hash[bucket].dtdh_lock == lock);
ASSERT(lock & 1);
hash[bucket].dtdh_lock++;
}
return (NULL);
}
/*
* We need to allocate a new dynamic variable. The size we need is the
* size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
* size of any auxiliary key data (rounded up to 8-byte alignment) plus
* the size of any referred-to data (dsize). We then round the final
* size up to the chunksize for allocation.
*/
for (ksize = 0, i = 0; i < nkeys; i++)
ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
/*
* This should be pretty much impossible, but could happen if, say,
* strange DIF specified the tuple. Ideally, this should be an
* assertion and not an error condition -- but that requires that the
* chunksize calculation in dtrace_difo_chunksize() be absolutely
* bullet-proof. (That is, it must not be able to be fooled by
* malicious DIF.) Given the lack of backwards branches in DIF,
* solving this would presumably not amount to solving the Halting
* Problem -- but it still seems awfully hard.
*/
if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
ksize + dsize > chunksize) {
dcpu->dtdsc_drops++;
return (NULL);
}
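
	/*
	 * Purely as an illustration of the check above (the numbers are
	 * hypothetical):  with nkeys = 2, where the first key is an integer
	 * passed by value (dttk_size == 0, contributing nothing to ksize)
	 * and the second is a string key with dttk_size == 13 (rounded up
	 * to 16), and with dsize = 8, the space required is
	 * sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) + 16 + 8 -- and
	 * this total must not exceed the chunksize computed by
	 * dtrace_difo_chunksize().
	 */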
nstate = DTRACE_DSTATE_EMPTY;
do {
retry:
free = dcpu->dtdsc_free;
if (free == NULL) {
dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
void *rval;
if (clean == NULL) {
/*
* We're out of dynamic variable space on
* this CPU. Unless we have tried all CPUs,
* we'll try to allocate from a different
* CPU.
*/
switch (dstate->dtds_state) {
case DTRACE_DSTATE_CLEAN: {
void *sp = &dstate->dtds_state;
if (++cpu >= NCPU)
cpu = 0;
if (dcpu->dtdsc_dirty != NULL &&
nstate == DTRACE_DSTATE_EMPTY)
nstate = DTRACE_DSTATE_DIRTY;
if (dcpu->dtdsc_rinsing != NULL)
nstate = DTRACE_DSTATE_RINSING;
dcpu = &dstate->dtds_percpu[cpu];
if (cpu != me)
goto retry;
(void) dtrace_cas32(sp,
DTRACE_DSTATE_CLEAN, nstate);
/*
* To increment the correct bean
* counter, take another lap.
*/
goto retry;
}
case DTRACE_DSTATE_DIRTY:
dcpu->dtdsc_dirty_drops++;
break;
case DTRACE_DSTATE_RINSING:
dcpu->dtdsc_rinsing_drops++;
break;
case DTRACE_DSTATE_EMPTY:
dcpu->dtdsc_drops++;
break;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
return (NULL);
}
/*
* The clean list appears to be non-empty. We want to
* move the clean list to the free list; we start by
* moving the clean pointer aside.
*/
if (dtrace_casptr(&dcpu->dtdsc_clean,
clean, NULL) != clean) {
/*
* We are in one of two situations:
*
* (a) The clean list was switched to the
* free list by another CPU.
*
* (b) The clean list was added to by the
* cleansing cyclic.
*
* In either of these situations, we can
* just reattempt the free list allocation.
*/
goto retry;
}
ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
/*
* Now we'll move the clean list to our free list.
* It's impossible for this to fail: the only way
* the free list can be updated is through this
* code path, and only one CPU can own the clean list.
* Thus, it would only be possible for this to fail if
* this code were racing with dtrace_dynvar_clean().
* (That is, if dtrace_dynvar_clean() updated the clean
* list, and we ended up racing to update the free
* list.) This race is prevented by the dtrace_sync()
* in dtrace_dynvar_clean() -- which flushes the
* owners of the clean lists out before resetting
* the clean lists.
*/
dcpu = &dstate->dtds_percpu[me];
rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
ASSERT(rval == NULL);
goto retry;
}
dvar = free;
new_free = dvar->dtdv_next;
} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
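
	/*
	 * To recap the per-CPU list machinery used above (the authoritative
	 * discussion is in <sys/dtrace_impl.h>):  chunks are allocated from
	 * dtdsc_free, and deallocated chunks are pushed onto dtdsc_dirty.
	 * The cleaning cyclic periodically moves dirty chunks to
	 * dtdsc_rinsing and then -- after a dtrace_sync() -- to dtdsc_clean,
	 * from which an allocator that has exhausted its free list may claim
	 * them (as done above) to become its new free list.
	 */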
/*
* We have now allocated a new chunk. We copy the tuple keys into the
* tuple array and copy any referenced key data into the data space
* following the tuple array. As we do this, we relocate dttk_value
* in the final tuple to point to the key data address in the chunk.
*/
kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
dvar->dtdv_data = (void *)(kdata + ksize);
dvar->dtdv_tuple.dtt_nkeys = nkeys;
for (i = 0; i < nkeys; i++) {
dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
size_t kesize = key[i].dttk_size;
if (kesize != 0) {
dtrace_bcopy(
(const void *)(uintptr_t)key[i].dttk_value,
(void *)kdata, kesize);
dkey->dttk_value = kdata;
kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
} else {
dkey->dttk_value = key[i].dttk_value;
}
dkey->dttk_size = kesize;
}
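
	/*
	 * At this point the chunk is laid out as follows (note that
	 * dtdv_tuple embeds the first dtrace_key_t, which is why the size
	 * check above counts only nkeys - 1 additional key slots):
	 *
	 *	dvar:		the dtrace_dynvar_t header, including
	 *			dtt_key[0] through dtt_key[nkeys - 1]
	 *	kdata:		by-reference key data, each key's data
	 *			rounded up to an 8-byte multiple (ksize)
	 *	dtdv_data:	the variable's data itself (dsize bytes)
	 */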
ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
dvar->dtdv_hashval = hashval;
dvar->dtdv_next = start;
if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
return (dvar);
/*
* The cas has failed. Either another CPU is adding an element to
* this hash chain, or another CPU is deleting an element from this
* hash chain. The simplest way to deal with both of these cases
* (though not necessarily the most efficient) is to free our
* allocated block and re-attempt it all. Note that the free is
* to the dirty list and _not_ to the free list. This is to prevent
* races with allocators, above.
*/
dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
dtrace_membar_producer();
do {
free = dcpu->dtdsc_dirty;
dvar->dtdv_next = free;
} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
goto top;
}

/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
if ((int64_t)nval < (int64_t)*oval)
*oval = nval;
}

/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
if ((int64_t)nval > (int64_t)*oval)
*oval = nval;
}

static void
dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
{
int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
int64_t val = (int64_t)nval;
if (val < 0) {
for (i = 0; i < zero; i++) {
if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
quanta[i] += incr;
return;
}
}
} else {
for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
quanta[i - 1] += incr;
return;
}
}
quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
return;
}
ASSERT(0);
}
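
/*
 * An example of dtrace_aggregate_quantize()'s bucketing above, using the
 * standard power-of-two buckets:  an incoming value of 37 is first less than
 * DTRACE_QUANTIZE_BUCKETVAL() at the bucket labelled 64, so the loop
 * increments the preceding bucket -- the one labelled 32, which accumulates
 * all values in [32, 63].  A value of 0 lands in the zero bucket itself, and
 * negative values are binned symmetrically by the first loop.
 */
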
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
uint64_t arg = *lquanta++;
int32_t base = DTRACE_LQUANTIZE_BASE(arg);
uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
int32_t val = (int32_t)nval, level;
ASSERT(step != 0);
ASSERT(levels != 0);
if (val < base) {
/*
* This is an underflow.
*/
lquanta[0] += incr;
return;
}
level = (val - base) / step;
if (level < levels) {
lquanta[level + 1] += incr;
return;
}
/*
* This is an overflow.
*/
lquanta[levels + 1] += incr;
}
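
/*
 * A worked example for dtrace_aggregate_lquantize() above, assuming the
 * encoded argument decodes to base = 0, step = 10 and levels = 10:  the
 * buffer then holds levels + 2 counters -- lquanta[0] for underflow (values
 * below 0), lquanta[1] through lquanta[10] for the ranges [0, 9] through
 * [90, 99], and lquanta[11] for overflow.  An incoming value of 37 yields
 * level = (37 - 0) / 10 = 3, incrementing lquanta[4], the counter for
 * [30, 39].
 */
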
static int
dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
uint16_t high, uint16_t nsteps, int64_t value)
{
int64_t this = 1, last, next;
int base = 1, order;
ASSERT(factor <= nsteps);
ASSERT(nsteps % factor == 0);
for (order = 0; order < low; order++)
this *= factor;
/*
* If our value is less than our factor taken to the power of the
* low order of magnitude, it goes into the zeroth bucket.
*/
if (value < (last = this))
return (0);
for (this *= factor; order <= high; order++) {
int nbuckets = this > nsteps ? nsteps : this;
if ((next = this * factor) < this) {
/*
* We should not generally get log/linear quantizations
* with a high magnitude that allows 64-bits to
* overflow, but we nonetheless protect against this
* by explicitly checking for overflow, and clamping
* our value accordingly.
*/
value = this - 1;
}
if (value < this) {
/*
* If our value lies within this order of magnitude,
* determine its position by taking the offset within
* the order of magnitude, dividing by the bucket
* width, and adding to our (accumulated) base.
*/
return (base + (value - last) / (this / nbuckets));
}
base += nbuckets - (nbuckets / factor);
last = this;
this = next;
}
/*
* Our value is greater than or equal to our factor taken to the
* power of one plus the high magnitude -- return the top bucket.
*/
return (base);
}
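
/*
 * To make the bucket arithmetic in dtrace_aggregate_llquantize_bucket()
 * concrete, consider factor = 10, low = 0, high = 2 and nsteps = 10:
 * bucket 0 catches values below factor^low = 1, and each order of magnitude
 * [1, 10), [10, 100) and [100, 1000) is divided into linear buckets of width
 * this / nbuckets.  Only nbuckets - nbuckets / factor of an order's buckets
 * are reachable (smaller values were handled by the previous order), which
 * is why base advances by exactly that amount.  A value of 42 falls in
 * [10, 100):  base is 1 + 9 = 10 at that point, the bucket width is
 * 100 / 10 = 10, and the function returns 10 + (42 - 10) / 10 = 13 -- the
 * bucket covering [40, 49].
 */
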
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
uint64_t arg = *llquanta++;
uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
llquanta[dtrace_aggregate_llquantize_bucket(factor,
low, high, nsteps, nval)] += incr;
}

/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
data[0]++;
data[1] += nval;
}
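
/*
 * For avg(), data[0] accumulates the number of values seen and data[1] their
 * running sum; the average itself is presumably computed by the consumer as
 * data[1] / data[0] when the aggregation is reported.
 */
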
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
int64_t snval = (int64_t)nval;
uint64_t tmp[2];
data[0]++;
data[1] += nval;
/*
* What we want to say here is:
*
* data[2] += nval * nval;
*
* But given that nval is 64-bit, we could easily overflow, so
* we do this as 128-bit arithmetic.
*/
if (snval < 0)
snval = -snval;
dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
dtrace_add_128(data + 2, tmp, data + 2);
}
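
/*
 * The stddev() data is thus laid out as data[0] = count, data[1] = sum and
 * data[2..3] = a 128-bit sum of squares.  From these three quantities the
 * consumer can derive the standard deviation -- e.g. as the square root of
 * (the average of the squares minus the square of the average) -- without
 * the kernel ever needing floating-point arithmetic.
 */
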
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
*oval = *oval + 1;
}

/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
*oval += nval;
}

/*
* Aggregate given the tuple in the principal data buffer, and the aggregating
* action denoted by the specified dtrace_aggregation_t. The aggregation
* buffer is specified as the buf parameter. This routine does not return
* failure; if there is no space in the aggregation buffer, the data will be
* dropped, and a corresponding counter incremented.
*/
static void
dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
{
dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
uint32_t i, ndx, size, fsize;
uint32_t align = sizeof (uint64_t) - 1;
dtrace_aggbuffer_t *agb;
dtrace_aggkey_t *key;
uint32_t hashval = 0, limit, isstr;
caddr_t tomax, data, kdata;
dtrace_actkind_t action;
dtrace_action_t *act;
uintptr_t offs;
if (buf == NULL)
return;
if (!agg->dtag_hasarg) {
/*
* Currently, only quantize() and lquantize() take additional
* arguments, and they have the same semantics: an increment
* value that defaults to 1 when not present. If additional
* aggregating actions take arguments, the setting of the
* default argument value will presumably have to become more
* sophisticated...
*/
arg = 1;
}
action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
size = rec->dtrd_offset - agg->dtag_base;
fsize = size + rec->dtrd_size;
ASSERT(dbuf->dtb_tomax != NULL);
data = dbuf->dtb_tomax + offset + agg->dtag_base;
if ((tomax = buf->dtb_tomax) == NULL) {
dtrace_buffer_drop(buf);
return;
}
/*
* The metastructure is always at the bottom of the buffer.
*/
agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
sizeof (dtrace_aggbuffer_t));
if (buf->dtb_offset == 0) {
/*
* We just kludge up approximately 1/8th of the size to be
* buckets. If this guess ends up being routinely
* off-the-mark, we may need to dynamically readjust this
* based on past performance.
*/
uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
(uintptr_t)tomax || hashsize == 0) {
/*
* We've been given a ludicrously small buffer;
* increment our drop count and leave.
*/
dtrace_buffer_drop(buf);
return;
}
/*
	 * And now, a pathetic attempt to get an odd (or perchance, a prime)
	 * hash size for better hash distribution.
*/
if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
hashsize -= DTRACE_AGGHASHSIZE_SLEW;
agb->dtagb_hashsize = hashsize;
agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
for (i = 0; i < agb->dtagb_hashsize; i++)
agb->dtagb_hash[i] = NULL;
}
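
	/*
	 * To recap the buffer layout established above:  the
	 * dtrace_aggbuffer_t metastructure lives at the very end of the
	 * buffer, immediately preceded by the hash bucket array
	 * (dtagb_hash).  dtrace_aggkey_t structures are carved out downward
	 * from dtagb_free (which starts at the hash array), while the key
	 * and value data that they point to are laid down upward from
	 * offset 0.  If the two regions would meet, we drop the data via
	 * dtrace_buffer_drop() rather than corrupt the buffer.
	 */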
ASSERT(agg->dtag_first != NULL);
ASSERT(agg->dtag_first->dta_intuple);
/*
* Calculate the hash value based on the key. Note that we _don't_
* include the aggid in the hashing (but we will store it as part of
* the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
* algorithm: a simple, quick algorithm that has no known funnels, and
* gets good distribution in practice. The efficacy of the hashing
* algorithm (and a comparison with other algorithms) may be found by
* running the ::dtrace_aggstat MDB dcmd.
*/
for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
i = act->dta_rec.dtrd_offset - agg->dtag_base;
limit = i + act->dta_rec.dtrd_size;
ASSERT(limit <= size);
isstr = DTRACEACT_ISSTRING(act);
for (; i < limit; i++) {
hashval += data[i];
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
if (isstr && data[i] == '\0')
break;
}
}
hashval += (hashval << 3);
hashval ^= (hashval >> 11);
hashval += (hashval << 15);
/*
* Yes, the divide here is expensive -- but it's generally the least
* of the performance issues given the amount of data that we iterate
* over to compute hash values, compare data, etc.
*/
ndx = hashval % agb->dtagb_hashsize;
for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
ASSERT((caddr_t)key >= tomax);
ASSERT((caddr_t)key < tomax + buf->dtb_size);
if (hashval != key->dtak_hashval || key->dtak_size != size)
continue;
kdata = key->dtak_data;
ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
for (act = agg->dtag_first; act->dta_intuple;
act = act->dta_next) {
i = act->dta_rec.dtrd_offset - agg->dtag_base;
limit = i + act->dta_rec.dtrd_size;
ASSERT(limit <= size);
isstr = DTRACEACT_ISSTRING(act);
for (; i < limit; i++) {
if (kdata[i] != data[i])
goto next;
if (isstr && data[i] == '\0')
break;
}
}
if (action != key->dtak_action) {
/*
* We are aggregating on the same value in the same
* aggregation with two different aggregating actions.
* (This should have been picked up in the compiler,
* so we may be dealing with errant or devious DIF.)
* This is an error condition; we indicate as much,
* and return.
*/
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
return;
}
/*
* This is a hit: we need to apply the aggregator to
* the value at this key.
*/
agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
return;
next:
continue;
}
/*
* We didn't find it. We need to allocate some zero-filled space,
* link it into the hash table appropriately, and apply the aggregator
* to the (zero-filled) value.
*/
offs = buf->dtb_offset;
while (offs & (align - 1))
offs += sizeof (uint32_t);
/*
* If we don't have enough room to both allocate a new key _and_
* its associated data, increment the drop count and return.
*/
if ((uintptr_t)tomax + offs + fsize >
agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
dtrace_buffer_drop(buf);
return;
}
/*CONSTCOND*/
ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
agb->dtagb_free -= sizeof (dtrace_aggkey_t);
key->dtak_data = kdata = tomax + offs;
buf->dtb_offset = offs + fsize;
/*
* Now copy the data across.
*/
*((dtrace_aggid_t *)kdata) = agg->dtag_id;
for (i = sizeof (dtrace_aggid_t); i < size; i++)
kdata[i] = data[i];
/*
* Because strings are not zeroed out by default, we need to iterate
* looking for actions that store strings, and we need to explicitly
* pad these strings out with zeroes.
*/
for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
int nul;
if (!DTRACEACT_ISSTRING(act))
continue;
i = act->dta_rec.dtrd_offset - agg->dtag_base;
limit = i + act->dta_rec.dtrd_size;
ASSERT(limit <= size);
for (nul = 0; i < limit; i++) {
if (nul) {
kdata[i] = '\0';
continue;
}
if (data[i] != '\0')
continue;
nul = 1;
}
}
for (i = size; i < fsize; i++)
kdata[i] = 0;
key->dtak_hashval = hashval;
key->dtak_size = size;
key->dtak_action = action;
key->dtak_next = agb->dtagb_hash[ndx];
agb->dtagb_hash[ndx] = key;
/*
* Finally, apply the aggregator.
*/
*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
}

/*
* Given consumer state, this routine finds a speculation in the INACTIVE
* state and transitions it into the ACTIVE state. If there is no speculation
* in the INACTIVE state, 0 is returned. In this case, no error counter is
* incremented -- it is up to the caller to take appropriate action.
*/
static int
dtrace_speculation(dtrace_state_t *state)
{
int i = 0;
dtrace_speculation_state_t current;
uint32_t *stat = &state->dts_speculations_unavail, count;
while (i < state->dts_nspeculations) {
dtrace_speculation_t *spec = &state->dts_speculations[i];
current = spec->dtsp_state;
if (current != DTRACESPEC_INACTIVE) {
if (current == DTRACESPEC_COMMITTINGMANY ||
current == DTRACESPEC_COMMITTING ||
current == DTRACESPEC_DISCARDING)
stat = &state->dts_speculations_busy;
i++;
continue;
}
if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, DTRACESPEC_ACTIVE) == current)
return (i + 1);
}
/*
* We couldn't find a speculation. If we found as much as a single
* busy speculation buffer, we'll attribute this failure as "busy"
* instead of "unavail".
*/
do {
count = *stat;
} while (dtrace_cas32(stat, count, count + 1) != count);
return (0);
}
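
/*
 * For reference, a sketch (not taken from any particular script) of the
 * D-level usage that ultimately lands in dtrace_speculation() above:
 *
 *	syscall::open:entry
 *	{
 *		self->spec = speculation();
 *	}
 *
 *	syscall::open:entry
 *	/self->spec/
 *	{
 *		speculate(self->spec);
 *		printf("opening %s\n", copyinstr(arg0));
 *	}
 *
 *	syscall::open:return
 *	/self->spec && errno != 0/
 *	{
 *		commit(self->spec);
 *		self->spec = 0;
 *	}
 *
 *	syscall::open:return
 *	/self->spec/
 *	{
 *		discard(self->spec);
 *		self->spec = 0;
 *	}
 *
 * The speculation() action calls into dtrace_speculation() to claim an
 * INACTIVE speculation; the later commit() or discard() resolves it.
 */
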
/*
* This routine commits an active speculation. If the specified speculation
* is not in a valid state to perform a commit(), this routine will silently do
* nothing. The state of the specified speculation is transitioned according
 * to the state transition diagram outlined in <sys/dtrace_impl.h>.
*/
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
dtrace_specid_t which)
{
dtrace_speculation_t *spec;
dtrace_buffer_t *src, *dest;
uintptr_t daddr, saddr, dlimit, slimit;
dtrace_speculation_state_t current, new;
intptr_t offs;
uint64_t timestamp;
if (which == 0)
return;
if (which > state->dts_nspeculations) {
cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
return;
}
spec = &state->dts_speculations[which - 1];
src = &spec->dtsp_buffer[cpu];
dest = &state->dts_buffer[cpu];
do {
current = spec->dtsp_state;
if (current == DTRACESPEC_COMMITTINGMANY)
break;
switch (current) {
case DTRACESPEC_INACTIVE:
case DTRACESPEC_DISCARDING:
return;
case DTRACESPEC_COMMITTING:
/*
* This is only possible if we are (a) commit()'ing
* without having done a prior speculate() on this CPU
* and (b) racing with another commit() on a different
* CPU. There's nothing to do -- we just assert that
* our offset is 0.
*/
ASSERT(src->dtb_offset == 0);
return;
case DTRACESPEC_ACTIVE:
new = DTRACESPEC_COMMITTING;
break;
case DTRACESPEC_ACTIVEONE:
/*
* This speculation is active on one CPU. If our
* buffer offset is non-zero, we know that the one CPU
* must be us. Otherwise, we are committing on a
* different CPU from the speculate(), and we must
* rely on being asynchronously cleaned.
*/
if (src->dtb_offset != 0) {
new = DTRACESPEC_COMMITTING;
break;
}
/*FALLTHROUGH*/
case DTRACESPEC_ACTIVEMANY:
new = DTRACESPEC_COMMITTINGMANY;
break;
default:
ASSERT(0);
}
} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, new) != current);
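
	/*
	 * In summary, the loop above effects the following state
	 * transitions (the full diagram is in <sys/dtrace_impl.h>):
	 *
	 *	ACTIVE			-> COMMITTING
	 *	ACTIVEONE (our CPU)	-> COMMITTING
	 *	ACTIVEONE (other CPU)	-> COMMITTINGMANY
	 *	ACTIVEMANY		-> COMMITTINGMANY
	 *	COMMITTINGMANY		-> unchanged; we proceed with our copy
	 *	INACTIVE, DISCARDING	-> unchanged; the commit is ignored
	 *	COMMITTING		-> unchanged; a racing commit owns it
	 */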
/*
* We have set the state to indicate that we are committing this
* speculation. Now reserve the necessary space in the destination
* buffer.
*/
if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
sizeof (uint64_t), state, NULL)) < 0) {
dtrace_buffer_drop(dest);
goto out;
}
/*
* We have sufficient space to copy the speculative buffer into the
* primary buffer. First, modify the speculative buffer, filling
* in the timestamp of all entries with the current time. The data
* must have the commit() time rather than the time it was traced,
* so that all entries in the primary buffer are in timestamp order.
*/
timestamp = dtrace_gethrtime();
saddr = (uintptr_t)src->dtb_tomax;
slimit = saddr + src->dtb_offset;
while (saddr < slimit) {
size_t size;
dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
saddr += sizeof (dtrace_epid_t);
continue;
}
ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
ASSERT3U(saddr + size, <=, slimit);
ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
saddr += size;
}
/*
* Copy the buffer across. (Note that this is a
	 * highly suboptimal bcopy(); in the unlikely event that this becomes
* a serious performance issue, a high-performance DTrace-specific
* bcopy() should obviously be invented.)