| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. |
| * Copyright (c) 2018, Joyent, Inc. |
| * Copyright (c) 2012, 2014 by Delphix. All rights reserved. |
| */ |
| |
| /* |
| * DTrace - Dynamic Tracing for Solaris |
| * |
| * This is the implementation of the Solaris Dynamic Tracing framework |
| * (DTrace). The user-visible interface to DTrace is described at length in |
| * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace |
| * library, the in-kernel DTrace framework, and the DTrace providers are |
| * described in the block comments in the <sys/dtrace.h> header file. The |
| * internal architecture of DTrace is described in the block comments in the |
| * <sys/dtrace_impl.h> header file. The comments contained within the DTrace |
| * implementation very much assume mastery of all of these sources; if one has |
| * an unanswered question about the implementation, one should consult them |
| * first. |
| * |
| * The functions here are ordered roughly as follows: |
| * |
| * - Probe context functions |
| * - Probe hashing functions |
| * - Non-probe context utility functions |
| * - Matching functions |
| * - Provider-to-Framework API functions |
| * - Probe management functions |
| * - DIF object functions |
| * - Format functions |
| * - Predicate functions |
| * - ECB functions |
| * - Buffer functions |
| * - Enabling functions |
| * - DOF functions |
| * - Anonymous enabling functions |
| * - Consumer state functions |
| * - Helper functions |
| * - Hook functions |
| * - Driver cookbook functions |
| * |
| * Each group of functions begins with a block comment labelled the "DTrace |
| * [Group] Functions", allowing one to find each block by searching forward |
| * on capital-f functions. |
| */ |
| #include <sys/errno.h> |
| #include <sys/stat.h> |
| #include <sys/modctl.h> |
| #include <sys/conf.h> |
| #include <sys/systm.h> |
| #include <sys/ddi.h> |
| #include <sys/sunddi.h> |
| #include <sys/cpuvar.h> |
| #include <sys/kmem.h> |
| #include <sys/strsubr.h> |
| #include <sys/sysmacros.h> |
| #include <sys/dtrace_impl.h> |
| #include <sys/atomic.h> |
| #include <sys/cmn_err.h> |
| #include <sys/mutex_impl.h> |
| #include <sys/rwlock_impl.h> |
| #include <sys/ctf_api.h> |
| #include <sys/panic.h> |
| #include <sys/priv_impl.h> |
| #include <sys/policy.h> |
| #include <sys/cred_impl.h> |
| #include <sys/procfs_isa.h> |
| #include <sys/taskq.h> |
| #include <sys/mkdev.h> |
| #include <sys/kdi.h> |
| #include <sys/zone.h> |
| #include <sys/socket.h> |
| #include <netinet/in.h> |
| #include "strtolctype.h" |
| |
| /* |
| * DTrace Tunable Variables |
| * |
| * The following variables may be tuned by adding a line to /etc/system that |
| * includes both the name of the DTrace module ("dtrace") and the name of the |
| * variable. For example: |
| * |
| * set dtrace:dtrace_destructive_disallow = 1 |
| * |
| * In general, the only variables that one should be tuning this way are those |
| * that affect system-wide DTrace behavior, and for which the default behavior |
| * is undesirable. Most of these variables are tunable on a per-consumer |
| * basis using DTrace options, and need not be tuned on a system-wide basis. |
| * When tuning these variables, avoid pathological values; while some attempt |
| * is made to verify the integrity of these variables, they are not considered |
| * part of the supported interface to DTrace, and they are therefore not |
| * checked comprehensively. Further, these variables should not be tuned |
| * dynamically via "mdb -kw" or other means; they should only be tuned via |
| * /etc/system. |
| */ |
| int dtrace_destructive_disallow = 0; |
| dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024); |
| size_t dtrace_difo_maxsize = (256 * 1024); |
| dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024); |
| size_t dtrace_statvar_maxsize = (16 * 1024); |
| size_t dtrace_actions_max = (16 * 1024); |
| size_t dtrace_retain_max = 1024; |
| dtrace_optval_t dtrace_helper_actions_max = 1024; |
| dtrace_optval_t dtrace_helper_providers_max = 32; |
| dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024); |
| size_t dtrace_strsize_default = 256; |
| dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */ |
| dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */ |
| dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */ |
| dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */ |
| dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */ |
| dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */ |
| dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */ |
| dtrace_optval_t dtrace_nspec_default = 1; |
| dtrace_optval_t dtrace_specsize_default = 32 * 1024; |
| dtrace_optval_t dtrace_stackframes_default = 20; |
| dtrace_optval_t dtrace_ustackframes_default = 20; |
| dtrace_optval_t dtrace_jstackframes_default = 50; |
| dtrace_optval_t dtrace_jstackstrsize_default = 512; |
| int dtrace_msgdsize_max = 128; |
| hrtime_t dtrace_chill_max = MSEC2NSEC(500); /* 500 ms */ |
| hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */ |
| int dtrace_devdepth_max = 32; |
| int dtrace_err_verbose; |
| hrtime_t dtrace_deadman_interval = NANOSEC; |
| hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; |
| hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC; |
| hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC; |
| |
| /* |
| * DTrace External Variables |
| * |
| * As dtrace(7D) is a kernel module, any DTrace variables are obviously |
| * available to DTrace consumers via the backtick (`) syntax. One of these, |
| * dtrace_zero, is made deliberately so: it is provided as a source of |
| * well-known, zero-filled memory. While this variable is not documented, |
| * it is used by some translators as an implementation detail. |
| */ |
| const char dtrace_zero[256] = { 0 }; /* zero-filled memory */ |
| |
| /* |
| * DTrace Internal Variables |
| */ |
| static dev_info_t *dtrace_devi; /* device info */ |
| static vmem_t *dtrace_arena; /* probe ID arena */ |
| static vmem_t *dtrace_minor; /* minor number arena */ |
| static taskq_t *dtrace_taskq; /* task queue */ |
| static dtrace_probe_t **dtrace_probes; /* array of all probes */ |
| static int dtrace_nprobes; /* number of probes */ |
| static dtrace_provider_t *dtrace_provider; /* provider list */ |
| static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */ |
| static int dtrace_opens; /* number of opens */ |
| static int dtrace_helpers; /* number of helpers */ |
| static int dtrace_getf; /* number of unpriv getf()s */ |
| static void *dtrace_softstate; /* softstate pointer */ |
| static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */ |
| static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */ |
| static dtrace_hash_t *dtrace_byname; /* probes hashed by name */ |
| static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */ |
| static int dtrace_toxranges; /* number of toxic ranges */ |
| static int dtrace_toxranges_max; /* size of toxic range array */ |
| static dtrace_anon_t dtrace_anon; /* anonymous enabling */ |
| static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */ |
| static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */ |
| static kthread_t *dtrace_panicked; /* panicking thread */ |
| static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */ |
| static dtrace_genid_t dtrace_probegen; /* current probe generation */ |
| static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */ |
| static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */ |
| static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */ |
| static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */ |
| static int dtrace_dynvar_failclean; /* dynvars failed to clean */ |
| |
| /* |
| * DTrace Locking |
| * DTrace is protected by three (relatively coarse-grained) locks: |
| * |
| * (1) dtrace_lock is required to manipulate essentially any DTrace state, |
| * including enabling state, probes, ECBs, consumer state, helper state, |
| * etc. Importantly, dtrace_lock is _not_ required when in probe context; |
| * probe context is lock-free -- synchronization is handled via the |
| * dtrace_sync() cross call mechanism. |
| * |
| * (2) dtrace_provider_lock is required when manipulating provider state, or |
| * when provider state must be held constant. |
| * |
| * (3) dtrace_meta_lock is required when manipulating meta provider state, or |
| * when meta provider state must be held constant. |
| * |
| * The lock ordering between these three locks is dtrace_meta_lock before |
| * dtrace_provider_lock before dtrace_lock. (In particular, there are |
| * several places where dtrace_provider_lock is held by the framework as it |
| * calls into the providers -- which then call back into the framework, |
| * grabbing dtrace_lock.) |
| * |
| * There are two other locks in the mix: mod_lock and cpu_lock. With respect |
| * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical |
| * role as a coarse-grained lock; it is acquired before both of these locks. |
| * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must |
| * be acquired _between_ dtrace_meta_lock and any other DTrace locks. |
| * mod_lock is similar with respect to dtrace_provider_lock in that it must be |
| * acquired _between_ dtrace_provider_lock and dtrace_lock. |
| */ |
| static kmutex_t dtrace_lock; /* probe state lock */ |
| static kmutex_t dtrace_provider_lock; /* provider state lock */ |
| static kmutex_t dtrace_meta_lock; /* meta-provider state lock */ |
| |
| /* |
| * DTrace Provider Variables |
| * |
| * These are the variables relating to DTrace as a provider (that is, the |
| * provider of the BEGIN, END, and ERROR probes). |
| */ |
| static dtrace_pattr_t dtrace_provider_attr = { |
| { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON }, |
| { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, |
| { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, |
| { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON }, |
| { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON }, |
| }; |
| |
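| /* |
| * Stub entry points used to populate the framework's own provider |
| * operations vector below: dtrace_nullop() silently does nothing, and |
| * dtrace_enable_nullop() does nothing and reports success. |
| */ |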
| static void |
| dtrace_nullop(void) |
| {} |
| |
| static int |
| dtrace_enable_nullop(void) |
| { |
| return (0); |
| } |
| |
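| /* |
| * The operations vector that DTrace registers for itself as the provider |
| * of the BEGIN, END, and ERROR probes. Every entry point is either a |
| * no-op stub or NULL; these special probes are created and fired by the |
| * framework itself rather than through provider callbacks. |
| */ |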
| static dtrace_pops_t dtrace_provider_ops = { |
| (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, |
| (void (*)(void *, struct modctl *))dtrace_nullop, |
| (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, |
| (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, |
| (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, |
| (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, |
| NULL, |
| NULL, |
| NULL, |
| (void (*)(void *, dtrace_id_t, void *))dtrace_nullop |
| }; |
| |
| static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */ |
| static dtrace_id_t dtrace_probeid_end; /* special END probe */ |
| dtrace_id_t dtrace_probeid_error; /* special ERROR probe */ |
| |
| /* |
| * DTrace Helper Tracing Variables |
| * |
| * These variables should be set dynamically to enable helper tracing. The |
| * only variables that should be set are dtrace_helptrace_enable (which should |
| * be set to a non-zero value to allocate helper tracing buffers on the next |
| * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a |
| * non-zero value to deallocate helper tracing buffers on the next close of |
| * /dev/dtrace). When (and only when) helper tracing is disabled, the |
| * buffer size may also be set via dtrace_helptrace_bufsize. |
| */ |
| int dtrace_helptrace_enable = 0; |
| int dtrace_helptrace_disable = 0; |
| int dtrace_helptrace_bufsize = 16 * 1024 * 1024; |
| uint32_t dtrace_helptrace_nlocals; |
| static dtrace_helptrace_t *dtrace_helptrace_buffer; |
| static uint32_t dtrace_helptrace_next = 0; |
| static int dtrace_helptrace_wrapped = 0; |
| |
| /* |
| * DTrace Error Hashing |
| * |
| * On DEBUG kernels, DTrace will track the errors that it has seen in a hash |
| * table. This is very useful for checking coverage of tests that are |
| * expected to induce DIF or DOF processing errors, and may be useful for |
| * debugging problems in the DIF code generator or in DOF generation. The |
| * error hash may be examined with the ::dtrace_errhash MDB dcmd. |
| */ |
| #ifdef DEBUG |
| static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ]; |
| static const char *dtrace_errlast; |
| static kthread_t *dtrace_errthread; |
| static kmutex_t dtrace_errlock; |
| #endif |
| |
| /* |
| * DTrace Macros and Constants |
| * |
| * These are various macros that are useful in various spots in the |
| * implementation, along with a few random constants that have no meaning |
| * outside of the implementation. There is no real structure to this cpp |
| * mishmash -- but is there ever? |
| */ |
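| |
| /* |
| * Probe-hash helper macros: given a dtrace_hash_t and a probe, these use |
| * the offsets recorded in the hash (dth_stroffs, dth_nextoffs and |
| * dth_prevoffs) to hash the relevant probe string, locate the next/prev |
| * links, and compare two probes for equality of that string. |
| */ |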
| #define DTRACE_HASHSTR(hash, probe) \ |
| dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs))) |
| |
| #define DTRACE_HASHNEXT(hash, probe) \ |
| (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs) |
| |
| #define DTRACE_HASHPREV(hash, probe) \ |
| (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs) |
| |
| #define DTRACE_HASHEQ(hash, lhs, rhs) \ |
| (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \ |
| *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0) |
| |
| #define DTRACE_AGGHASHSIZE_SLEW 17 |
| |
| #define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3) |
| |
| /* |
| * The key for a thread-local variable consists of the lower 61 bits of the |
| * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL. |
| * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never |
| * equal to a variable identifier. This is necessary (but not sufficient) to |
| * assure that global associative arrays never collide with thread-local |
| * variables. To guarantee that they cannot collide, we must also define the |
| * order for keying dynamic variables. That order is: |
| * |
| * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ] |
| * |
| * Because the variable-key and the tls-key are in orthogonal spaces, there is |
| * no way for a global variable key signature to match a thread-local key |
| * signature. |
| */ |
| #define DTRACE_TLS_THRKEY(where) { \ |
| uint_t intr = 0; \ |
| uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \ |
| for (; actv; actv >>= 1) \ |
| intr++; \ |
| ASSERT(intr < (1 << 3)); \ |
| (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \ |
| (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \ |
| } |
| |
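| /* |
| * Byte-swap macros: each width is byte-swapped by swapping the swapped |
| * halves of the next-smaller width. DT_MASK_LO isolates the low 32 bits |
| * of a 64-bit value. |
| */ |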
| #define DT_BSWAP_8(x) ((x) & 0xff) |
| #define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8)) |
| #define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16)) |
| #define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32)) |
| |
| #define DT_MASK_LO 0x00000000FFFFFFFFULL |
| |
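| /* |
| * Store 'what', cast to the given type, at 'offset' bytes into the |
| * 'tomax' buffer. |
| */ |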
| #define DTRACE_STORE(type, tomax, offset, what) \ |
| *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what); |
| |
| #ifndef __x86 |
| #define DTRACE_ALIGNCHECK(addr, size, flags) \ |
| if (addr & (size - 1)) { \ |
| *flags |= CPU_DTRACE_BADALIGN; \ |
| cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \ |
| return (0); \ |
| } |
| #else |
| #define DTRACE_ALIGNCHECK(addr, size, flags) |
| #endif |
| |
| /* |
| * Test whether a range of memory starting at testaddr of size testsz falls |
| * within the range of memory described by addr, sz. We take care to avoid |
| * problems with overflow and underflow of the unsigned quantities, and |
| * disallow all negative sizes. Ranges of size 0 are allowed. |
| */ |
| #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \ |
| ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \ |
| (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \ |
| (testaddr) + (testsz) >= (testaddr)) |
| |
| #define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \ |
| do { \ |
| if ((remp) != NULL) { \ |
| *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \ |
| } \ |
| _NOTE(CONSTCOND) } while (0) |
| |
| |
| /* |
| * Test whether alloc_sz bytes will fit in the scratch region. We isolate |
| * alloc_sz on the righthand side of the comparison in order to avoid overflow |
| * or underflow in the comparison with it. This is simpler than the INRANGE |
| * check above, because we know that the dtms_scratch_ptr is valid in the |
| * range. Allocations of size zero are allowed. |
| */ |
| #define DTRACE_INSCRATCH(mstate, alloc_sz) \ |
| ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \ |
| (mstate)->dtms_scratch_ptr >= (alloc_sz)) |
| |
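| /* |
| * DTRACE_LOADFUNC(bits) expands into the definition of dtrace_load<bits>(), |
| * the safe load routine for that width: it performs the alignment check |
| * (where required), refuses loads that intersect a toxic range, and issues |
| * the load with CPU_DTRACE_NOFAULT set so that a bad address is reflected |
| * in the CPU's DTrace fault flags (and a zero return value) rather than |
| * inducing a panic. |
| */ |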
| #define DTRACE_LOADFUNC(bits) \ |
| /*CSTYLED*/ \ |
| uint##bits##_t \ |
| dtrace_load##bits(uintptr_t addr) \ |
| { \ |
| size_t size = bits / NBBY; \ |
| /*CSTYLED*/ \ |
| uint##bits##_t rval; \ |
| int i; \ |
| volatile uint16_t *flags = (volatile uint16_t *) \ |
| &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \ |
| \ |
| DTRACE_ALIGNCHECK(addr, size, flags); \ |
| \ |
| for (i = 0; i < dtrace_toxranges; i++) { \ |
| if (addr >= dtrace_toxrange[i].dtt_limit) \ |
| continue; \ |
| \ |
| if (addr + size <= dtrace_toxrange[i].dtt_base) \ |
| continue; \ |
| \ |
| /* \ |
| * This address falls within a toxic region; return 0. \ |
| */ \ |
| *flags |= CPU_DTRACE_BADADDR; \ |
| cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \ |
| return (0); \ |
| } \ |
| \ |
| *flags |= CPU_DTRACE_NOFAULT; \ |
| /*CSTYLED*/ \ |
| rval = *((volatile uint##bits##_t *)addr); \ |
| *flags &= ~CPU_DTRACE_NOFAULT; \ |
| \ |
| return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \ |
| } |
| |
| #ifdef _LP64 |
| #define dtrace_loadptr dtrace_load64 |
| #else |
| #define dtrace_loadptr dtrace_load32 |
| #endif |
| |
| #define DTRACE_DYNHASH_FREE 0 |
| #define DTRACE_DYNHASH_SINK 1 |
| #define DTRACE_DYNHASH_VALID 2 |
| |
| #define DTRACE_MATCH_FAIL -1 |
| #define DTRACE_MATCH_NEXT 0 |
| #define DTRACE_MATCH_DONE 1 |
| #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0') |
| #define DTRACE_STATE_ALIGN 64 |
| |
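| /* |
| * Map the first set CPU_DTRACE_* fault flag to the corresponding |
| * DTRACEFLT_* fault code reported to consumers. |
| */ |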
| #define DTRACE_FLAGS2FLT(flags) \ |
| (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \ |
| ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \ |
| ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \ |
| ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \ |
| ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \ |
| ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \ |
| ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \ |
| ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \ |
| ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \ |
| DTRACEFLT_UNKNOWN) |
| |
| #define DTRACEACT_ISSTRING(act) \ |
| ((act)->dta_kind == DTRACEACT_DIFEXPR && \ |
| (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) |
| |
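| /* |
| * Forward declarations for functions referenced before they are defined. |
| */ |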
| static size_t dtrace_strlen(const char *, size_t); |
| static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id); |
| static void dtrace_enabling_provide(dtrace_provider_t *); |
| static int dtrace_enabling_match(dtrace_enabling_t *, int *); |
| static void dtrace_enabling_matchall(void); |
| static void dtrace_enabling_reap(void); |
| static dtrace_state_t *dtrace_anon_grab(void); |
| static uint64_t dtrace_helper(int, dtrace_mstate_t *, |
| dtrace_state_t *, uint64_t, uint64_t); |
| static dtrace_helpers_t *dtrace_helpers_create(proc_t *); |
| static void dtrace_buffer_drop(dtrace_buffer_t *); |
| static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when); |
| static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t, |
| dtrace_state_t *, dtrace_mstate_t *); |
| static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t, |
| dtrace_optval_t); |
| static int dtrace_ecb_create_enable(dtrace_probe_t *, void *); |
| static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *); |
| static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *); |
| static void dtrace_getf_barrier(void); |
| static int dtrace_canload_remains(uint64_t, size_t, size_t *, |
| dtrace_mstate_t *, dtrace_vstate_t *); |
| static int dtrace_canstore_remains(uint64_t, size_t, size_t *, |
| dtrace_mstate_t *, dtrace_vstate_t *); |
| |
| /* |
| * DTrace Probe Context Functions |
| * |
| * These functions are called from probe context. Because probe context is |
| * any context in which C may be called, arbitrary locks may be held, |
| * interrupts may be disabled, we may be in arbitrary dispatched state, etc. |
| * As a result, functions called from probe context may only call other DTrace |
| * support functions -- they may not interact at all with the system at large. |
| * (Note that the ASSERT macro is made probe-context safe by redefining it in |
| * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary |
| * loads are to be performed from probe context, they _must_ be in terms of |
| * the safe dtrace_load*() variants. |
| * |
| * Some functions in this block are not actually called from probe context; |
| * for these functions, there will be a comment above the function reading |
| * "Note: not called from probe context." |
| */ |
| void |
| dtrace_panic(const char *format, ...) |
| { |
| va_list alist; |
| |
| va_start(alist, format); |
| dtrace_vpanic(format, alist); |
| va_end(alist); |
| } |
| |
| int |
| dtrace_assfail(const char *a, const char *f, int l) |
| { |
| dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l); |
| |
| /* |
| * We just need something here that even the most clever compiler |
| * cannot optimize away. |
| */ |
| return (a[(uintptr_t)f]); |
| } |
| |
| /* |
| * Atomically increment a specified error counter from probe context. |
| */ |
| static void |
| dtrace_error(uint32_t *counter) |
| { |
| /* |
| * Most counters stored to in probe context are per-CPU counters. |
| * However, there are some error conditions that are sufficiently |
| * arcane that they don't merit per-CPU storage. If these counters |
| * are incremented concurrently on different CPUs, scalability will be |
| * adversely affected -- but we don't expect them to be white-hot in a |
| * correctly constructed enabling... |
| */ |
| uint32_t oval, nval; |
| |
| do { |
| oval = *counter; |
| |
| if ((nval = oval + 1) == 0) { |
| /* |
| * If the counter would wrap, set it to 1 -- assuring |
| * that the counter is never zero when we have seen |
| * errors. (The counter must be 32-bits because we |
| * aren't guaranteed a 64-bit compare&swap operation.) |
| * To save this code both the infamy of being fingered |
| * by a priggish news story and the indignity of being |
| * the target of a neo-puritan witch trial, we're |
| * carefully avoiding any colorful description of the |
| * likelihood of this condition -- but suffice it to |
| * say that it is only slightly more likely than the |
| * overflow of predicate cache IDs, as discussed in |
| * dtrace_predicate_create(). |
| */ |
| nval = 1; |
| } |
| } while (dtrace_cas32(counter, oval, nval) != oval); |
| } |
| |
| /* |
| * Use the DTRACE_LOADFUNC macro to define functions for each of loading a |
| * uint8_t, a uint16_t, a uint32_t and a uint64_t. |
| */ |
| /* BEGIN CSTYLED */ |
| DTRACE_LOADFUNC(8) |
| DTRACE_LOADFUNC(16) |
| DTRACE_LOADFUNC(32) |
| DTRACE_LOADFUNC(64) |
| /* END CSTYLED */ |
| |
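| /* |
| * Check that the region [dest, dest + size) lies entirely within the |
| * portion of this probe's scratch space that has been allocated so far |
| * (that is, between dtms_scratch_base and dtms_scratch_ptr), taking care |
| * not to be fooled by overflow of dest + size. |
| */ |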
| static int |
| dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate) |
| { |
| if (dest < mstate->dtms_scratch_base) |
| return (0); |
| |
| if (dest + size < dest) |
| return (0); |
| |
| if (dest + size > mstate->dtms_scratch_ptr) |
| return (0); |
| |
| return (1); |
| } |
| |
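| /* |
| * Check whether [addr, addr + sz) falls within the data of one of the |
| * given statically-allocated variables (global or per-CPU local); if it |
| * does, report the remaining valid length via 'remain' and return success. |
| */ |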
| static int |
| dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain, |
| dtrace_statvar_t **svars, int nsvars) |
| { |
| int i; |
| size_t maxglobalsize, maxlocalsize; |
| |
| if (nsvars == 0) |
| return (0); |
| |
| maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t); |
| maxlocalsize = maxglobalsize * NCPU; |
| |
| for (i = 0; i < nsvars; i++) { |
| dtrace_statvar_t *svar = svars[i]; |
| uint8_t scope; |
| size_t size; |
| |
| if (svar == NULL || (size = svar->dtsv_size) == 0) |
| continue; |
| |
| scope = svar->dtsv_var.dtdv_scope; |
| |
| /* |
| * We verify that our size is valid in the spirit of providing |
| * defense in depth: we want to prevent attackers from using |
| * DTrace to escalate an orthogonal kernel heap corruption bug |
| * into the ability to store to arbitrary locations in memory. |
| */ |
| VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) || |
| (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize)); |
| |
| if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, |
| svar->dtsv_size)) { |
| DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data, |
| svar->dtsv_size); |
| return (1); |
| } |
| } |
| |
| return (0); |
| } |
| |
| /* |
| * Check to see if the address is within a memory region to which a store may |
| * be issued. This includes the DTrace scratch areas, and any DTrace variable |
| * region. The caller of dtrace_canstore() is responsible for performing any |
| * alignment checks that are needed before stores are actually executed. |
| */ |
| static int |
| dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, |
| dtrace_vstate_t *vstate) |
| { |
| return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate)); |
| } |
| |
| /* |
| * Implementation of dtrace_canstore which communicates the upper bound of the |
| * allowed memory region. |
| */ |
| static int |
| dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain, |
| dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) |
| { |
| /* |
| * First, check to see if the address is in scratch space... |
| */ |
| if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base, |
| mstate->dtms_scratch_size)) { |
| DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base, |
| mstate->dtms_scratch_size); |
| return (1); |
| } |
| |
| /* |
| * Now check to see if it's a dynamic variable. This check will pick |
| * up both thread-local variables and any global dynamically-allocated |
| * variables. |
| */ |
| if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base, |
| vstate->dtvs_dynvars.dtds_size)) { |
| dtrace_dstate_t *dstate = &vstate->dtvs_dynvars; |
| uintptr_t base = (uintptr_t)dstate->dtds_base + |
| (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t)); |
| uintptr_t chunkoffs; |
| dtrace_dynvar_t *dvar; |
| |
| /* |
| * Before we assume that we can store here, we need to make |
| * sure that it isn't in our metadata -- storing to our |
| * dynamic variable metadata would corrupt our state. For |
| * the range to not include any dynamic variable metadata, |
| * it must: |
| * |
| * (1) Start above the hash table that is at the base of |
| * the dynamic variable space |
| * |
| * (2) Have a starting chunk offset that is beyond the |
| * dtrace_dynvar_t that is at the base of every chunk |
| * |
| * (3) Not span a chunk boundary |
| * |
| * (4) Not be in the tuple space of a dynamic variable |
| * |
| */ |
| if (addr < base) |
| return (0); |
| |
| chunkoffs = (addr - base) % dstate->dtds_chunksize; |
| |
| if (chunkoffs < sizeof (dtrace_dynvar_t)) |
| return (0); |
| |
| if (chunkoffs + sz > dstate->dtds_chunksize) |
| return (0); |
| |
| dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs); |
| |
| if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) |
| return (0); |
| |
| if (chunkoffs < sizeof (dtrace_dynvar_t) + |
| ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t))) |
| return (0); |
| |
| DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize); |
| return (1); |
| } |
| |
| /* |
| * Finally, check the static local and global variables. These checks |
| * take the longest, so we perform them last. |
| */ |
| if (dtrace_canstore_statvar(addr, sz, remain, |
| vstate->dtvs_locals, vstate->dtvs_nlocals)) |
| return (1); |
| |
| if (dtrace_canstore_statvar(addr, sz, remain, |
| vstate->dtvs_globals, vstate->dtvs_nglobals)) |
| return (1); |
| |
| return (0); |
| } |
| |
| |
| /* |
| * Convenience routine to check to see if the address is within a memory |
| * region in which a load may be issued given the user's privilege level; |
| * if not, it sets the appropriate error flags and loads 'addr' into the |
| * illegal value slot. |
| * |
| * DTrace subroutines (DIF_SUBR_*) should use this helper to implement |
| * appropriate memory access protection. |
| */ |
| static int |
| dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, |
| dtrace_vstate_t *vstate) |
| { |
| return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate)); |
| } |
| |
| /* |
| * Implementation of dtrace_canload which communicates the upper bound of the |
| * allowed memory region. |
| */ |
| static int |
| dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain, |
| dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) |
| { |
| volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval; |
| file_t *fp; |
| |
| /* |
| * If we hold the privilege to read from kernel memory, then |
| * everything is readable. |
| */ |
| if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { |
| DTRACE_RANGE_REMAIN(remain, addr, addr, sz); |
| return (1); |
| } |
| |
| /* |
| * You can obviously read that which you can store. |
| */ |
| if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate)) |
| return (1); |
| |
| /* |
| * We're allowed to read from our own string table. |
| */ |
| if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab, |
| mstate->dtms_difo->dtdo_strlen)) { |
| DTRACE_RANGE_REMAIN(remain, addr, |
| mstate->dtms_difo->dtdo_strtab, |
| mstate->dtms_difo->dtdo_strlen); |
| return (1); |
| } |
| |
| if (vstate->dtvs_state != NULL && |
| dtrace_priv_proc(vstate->dtvs_state, mstate)) { |
| proc_t *p; |
| |
| /* |
| * When we have privileges to the current process, there are |
| * several context-related kernel structures that are safe to |
| * read, even absent the privilege to read from kernel memory. |
| * These reads are safe because these structures contain only |
| * state that (1) we're permitted to read, (2) is harmless or |
| * (3) contains pointers to additional kernel state that we're |
| * not permitted to read (and as such, do not present an |
| * opportunity for privilege escalation). Finally (and |
| * critically), because of the nature of their relation with |
| * the current thread context, the memory associated with these |
| * structures cannot change over the duration of probe context, |
| * and it is therefore impossible for this memory to be |
| * deallocated and reallocated as something else while it's |
| * being operated upon. |
| */ |
| if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) { |
| DTRACE_RANGE_REMAIN(remain, addr, curthread, |
| sizeof (kthread_t)); |
| return (1); |
| } |
| |
| if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr, |
| sz, curthread->t_procp, sizeof (proc_t))) { |
| DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp, |
| sizeof (proc_t)); |
| return (1); |
| } |
| |
| if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz, |
| curthread->t_cred, sizeof (cred_t))) { |
| DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred, |
| sizeof (cred_t)); |
| return (1); |
| } |
| |
| if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz, |
| &(p->p_pidp->pid_id), sizeof (pid_t))) { |
| DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id), |
| sizeof (pid_t)); |
| return (1); |
| } |
| |
| if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz, |
| curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) { |
| DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu, |
| offsetof(cpu_t, cpu_pause_thread)); |
| return (1); |
| } |
| } |
| |
| if ((fp = mstate->dtms_getf) != NULL) { |
| uintptr_t psz = sizeof (void *); |
| vnode_t *vp; |
| vnodeops_t *op; |
| |
| /* |
| * When getf() returns a file_t, the enabling is implicitly |
| * granted the (transient) right to read the returned file_t |
| * as well as the v_path and v_op->vnop_name of the underlying |
| * vnode. These accesses are allowed after a successful |
| * getf() because the members that they refer to cannot change |
| * once set -- and the barrier logic in the kernel's closef() |
| * path assures that the file_t and its referenced vnode_t |
| * cannot themselves be stale (that is, it is impossible for |
| * either dtms_getf itself or its f_vnode member to reference |
| * freed memory). |
| */ |
| if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) { |
| DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t)); |
| return (1); |
| } |
| |
| if ((vp = fp->f_vnode) != NULL) { |
| size_t slen; |
| |
| if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) { |
| DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path, |
| psz); |
| return (1); |
| } |
| |
| slen = strlen(vp->v_path) + 1; |
| if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) { |
| DTRACE_RANGE_REMAIN(remain, addr, vp->v_path, |
| slen); |
| return (1); |
| } |
| |
| if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) { |
| DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op, |
| psz); |
| return (1); |
| } |
| |
| if ((op = vp->v_op) != NULL && |
| DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) { |
| DTRACE_RANGE_REMAIN(remain, addr, |
| &op->vnop_name, psz); |
| return (1); |
| } |
| |
| if (op != NULL && op->vnop_name != NULL && |
| DTRACE_INRANGE(addr, sz, op->vnop_name, |
| (slen = strlen(op->vnop_name) + 1))) { |
| DTRACE_RANGE_REMAIN(remain, addr, |
| op->vnop_name, slen); |
| return (1); |
| } |
| } |
| } |
| |
| DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV); |
| *illval = addr; |
| return (0); |
| } |
| |
| /* |
| * Convenience routine to check to see if a given string is within a memory |
| * region in which a load may be issued given the user's privilege level; |
| * this exists so that we don't need to issue unnecessary dtrace_strlen() |
| * calls in the event that the user has all privileges. |
| */ |
| static int |
| dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain, |
| dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) |
| { |
| size_t rsize; |
| |
| /* |
| * If we hold the privilege to read from kernel memory, then |
| * everything is readable. |
| */ |
| if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { |
| DTRACE_RANGE_REMAIN(remain, addr, addr, sz); |
| return (1); |
| } |
| |
| /* |
| * Even if the caller is uninterested in querying the remaining valid |
| * range, it is required to ensure that the access is allowed. |
| */ |
| if (remain == NULL) { |
| remain = &rsize; |
| } |
| if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) { |
| size_t strsz; |
| /* |
| * Perform the strlen after determining the length of the |
| * memory region which is accessible. This prevents timing |
| * information from being used to find NULs in memory which is |
| * not accessible to the caller. |
| */ |
| strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, |
| MIN(sz, *remain)); |
| if (strsz <= *remain) { |
| return (1); |
| } |
| } |
| |
| return (0); |
| } |
| |
| /* |
| * Convenience routine to check to see if a given variable is within a memory |
| * region in which a load may be issued given the user's privilege level. |
| */ |
| static int |
| dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain, |
| dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) |
| { |
| size_t sz; |
| ASSERT(type->dtdt_flags & DIF_TF_BYREF); |
| |
| /* |
| * Calculate the max size before performing any checks since even |
| * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function |
| * return the max length via 'remain'. |
| */ |
| if (type->dtdt_kind == DIF_TYPE_STRING) { |
| dtrace_state_t *state = vstate->dtvs_state; |
| |
| if (state != NULL) { |
| sz = state->dts_options[DTRACEOPT_STRSIZE]; |
| } else { |
| /* |
| * In helper context, we have a NULL state; fall back |
| * to using the system-wide default for the string size |
| * in this case. |
| */ |
| sz = dtrace_strsize_default; |
| } |
| } else { |
| sz = type->dtdt_size; |
| } |
| |
| /* |
| * If we hold the privilege to read from kernel memory, then |
| * everything is readable. |
| */ |
| if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) { |
| DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz); |
| return (1); |
| } |
| |
| if (type->dtdt_kind == DIF_TYPE_STRING) { |
| return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate, |
| vstate)); |
| } |
| return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate, |
| vstate)); |
| } |
| |
| /* |
| * Convert a string to a signed integer using safe loads. |
| * |
| * NOTE: This function uses various macros from strtolctype.h to manipulate |
| * digit values, etc -- these have all been checked to ensure they make |
| * no additional function calls. |
| */ |
| static int64_t |
| dtrace_strtoll(char *input, int base, size_t limit) |
| { |
| uintptr_t pos = (uintptr_t)input; |
| int64_t val = 0; |
| int x; |
| boolean_t neg = B_FALSE; |
| char c, cc, ccc; |
| uintptr_t end = pos + limit; |
| |
| /* |
| * Consume any whitespace preceding digits. |
| */ |
| while ((c = dtrace_load8(pos)) == ' ' || c == '\t') |
| pos++; |
| |
| /* |
| * Handle an explicit sign if one is present. |
| */ |
| if (c == '-' || c == '+') { |
| if (c == '-') |
| neg = B_TRUE; |
| c = dtrace_load8(++pos); |
| } |
| |
| /* |
| * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it |
| * if present. |
| */ |
| if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' || |
| cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) { |
| pos += 2; |
| c = ccc; |
| } |
| |
| /* |
| * Read in contiguous digits until the first non-digit character. |
| */ |
| for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base; |
| c = dtrace_load8(++pos)) |
| val = val * base + x; |
| |
| return (neg ? -val : val); |
| } |
| |
| /* |
| * Compare two strings using safe loads. |
| */ |
| static int |
| dtrace_strncmp(char *s1, char *s2, size_t limit) |
| { |
| uint8_t c1, c2; |
| volatile uint16_t *flags; |
| |
| if (s1 == s2 || limit == 0) |
| return (0); |
| |
| flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; |
| |
| do { |
| if (s1 == NULL) { |
| c1 = '\0'; |
| } else { |
| c1 = dtrace_load8((uintptr_t)s1++); |
| } |
| |
| if (s2 == NULL) { |
| c2 = '\0'; |
| } else { |
| c2 = dtrace_load8((uintptr_t)s2++); |
| } |
| |
| if (c1 != c2) |
| return (c1 - c2); |
| } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT)); |
| |
| return (0); |
| } |
| |
| /* |
| * Compute strlen(s) for a string using safe memory accesses. The additional |
| * lim parameter is used to specify a maximum length to ensure completion. |
| */ |
| static size_t |
| dtrace_strlen(const char *s, size_t lim) |
| { |
| uint_t len; |
| |
| for (len = 0; len != lim; len++) { |
| if (dtrace_load8((uintptr_t)s++) == '\0') |
| break; |
| } |
| |
| return (len); |
| } |
| |
| /* |
| * Check if an address falls within a toxic region. |
| */ |
| static int |
| dtrace_istoxic(uintptr_t kaddr, size_t size) |
| { |
| uintptr_t taddr, tsize; |
| int i; |
| |
| for (i = 0; i < dtrace_toxranges; i++) { |
| taddr = dtrace_toxrange[i].dtt_base; |
| tsize = dtrace_toxrange[i].dtt_limit - taddr; |
| |
| if (kaddr - taddr < tsize) { |
| DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); |
| cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr; |
| return (1); |
| } |
| |
| if (taddr - kaddr < size) { |
| DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR); |
| cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr; |
| return (1); |
| } |
| } |
| |
| return (0); |
| } |
| |
| /* |
| * Copy src to dst using safe memory accesses. The src is assumed to be unsafe |
| * memory specified by the DIF program. The dst is assumed to be safe memory |
| * that we can store to directly because it is managed by DTrace. As with |
| * standard bcopy, overlapping copies are handled properly. |
| */ |
| static void |
| dtrace_bcopy(const void *src, void *dst, size_t len) |
| { |
| if (len != 0) { |
| uint8_t *s1 = dst; |
| const uint8_t *s2 = src; |
| |
| if (s1 <= s2) { |
| do { |
| *s1++ = dtrace_load8((uintptr_t)s2++); |
| } while (--len != 0); |
| } else { |
| s2 += len; |
| s1 += len; |
| |
| do { |
| *--s1 = dtrace_load8((uintptr_t)--s2); |
| } while (--len != 0); |
| } |
| } |
| } |
| |
| /* |
| * Copy src to dst using safe memory accesses, up to either the specified |
| * length, or the point that a nul byte is encountered. The src is assumed to |
| * be unsafe memory specified by the DIF program. The dst is assumed to be |
| * safe memory that we can store to directly because it is managed by DTrace. |
| * Unlike dtrace_bcopy(), overlapping regions are not handled. |
| */ |
| static void |
| dtrace_strcpy(const void *src, void *dst, size_t len) |
| { |
| if (len != 0) { |
| uint8_t *s1 = dst, c; |
| const uint8_t *s2 = src; |
| |
| do { |
| *s1++ = c = dtrace_load8((uintptr_t)s2++); |
| } while (--len != 0 && c != '\0'); |
| } |
| } |
| |
| /* |
| * Copy src to dst, deriving the size and type from the specified (BYREF) |
| * variable type. The src is assumed to be unsafe memory specified by the DIF |
| * program. The dst is assumed to be DTrace variable memory that is of the |
| * specified type; we assume that we can store to it directly. |
| */ |
| static void |
| dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit) |
| { |
| ASSERT(type->dtdt_flags & DIF_TF_BYREF); |
| |
| if (type->dtdt_kind == DIF_TYPE_STRING) { |
| dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit)); |
| } else { |
| dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit)); |
| } |
| } |
| |
| /* |
| * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be |
| * unsafe memory specified by the DIF program. The s2 data is assumed to be |
| * safe memory that we can access directly because it is managed by DTrace. |
| */ |
| static int |
| dtrace_bcmp(const void *s1, const void *s2, size_t len) |
| { |
| volatile uint16_t *flags; |
| |
| flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags; |
| |
| if (s1 == s2) |
| return (0); |
| |
| if (s1 == NULL || s2 == NULL) |
| return (1); |
| |
| if (s1 != s2 && len != 0) { |
| const uint8_t *ps1 = s1; |
| const uint8_t *ps2 = s2; |
| |
| do { |
| if (dtrace_load8((uintptr_t)ps1++) != *ps2++) |
| return (1); |
| } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT)); |
| } |
| return (0); |
| } |
| |
| /* |
| * Zero the specified region using a simple byte-by-byte loop. Note that this |
| * is for safe DTrace-managed memory only. |
| */ |
| static void |
| dtrace_bzero(void *dst, size_t len) |
| { |
| uchar_t *cp; |
| |
| for (cp = dst; len != 0; len--) |
| *cp++ = 0; |
| } |
| |
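| /* |
| * Add two 128-bit values, each represented as two 64-bit words with the |
| * low word at index 0; the carry out of the low words is propagated into |
| * the high words. |
| */ |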
| static void |
| dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum) |
| { |
| uint64_t result[2]; |
| |
| result[0] = addend1[0] + addend2[0]; |
| result[1] = addend1[1] + addend2[1] + |
| (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0); |
| |
| sum[0] = result[0]; |
| sum[1] = result[1]; |
| } |
| |
| /* |
| * Shift the 128-bit value in a by b. If b is positive, shift left. |
| * If b is negative, shift right. |
| */ |
| static void |
| dtrace_shift_128(uint64_t *a, int b) |
| { |
| uint64_t mask; |
| |
| if (b == 0) |
| return; |
| |
| if (b < 0) { |
| b = -b; |
| if (b >= 64) { |
| a[0] = a[1] >> (b - 64); |
| a[1] = 0; |
| } else { |
| a[0] >>= b; |
| mask = 1LL << (64 - b); |
| mask -= 1; |
| a[0] |= ((a[1] & mask) << (64 - b)); |
| a[1] >>= b; |
| } |
| } else { |
| if (b >= 64) { |
| a[1] = a[0] << (b - 64); |
| a[0] = 0; |
| } else { |
| a[1] <<= b; |
| mask = a[0] >> (64 - b); |
| a[1] |= mask; |
| a[0] <<= b; |
| } |
| } |
| } |
| |
| /* |
| * The basic idea is to break the 2 64-bit values into 4 32-bit values, |
| * use native multiplication on those, and then re-combine into the |
| * resulting 128-bit value. |
| * |
| * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) = |
| * hi1 * hi2 << 64 + |
| * hi1 * lo2 << 32 + |
| * hi2 * lo1 << 32 + |
| * lo1 * lo2 |
| */ |
| static void |
| dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product) |
| { |
| uint64_t hi1, hi2, lo1, lo2; |
| uint64_t tmp[2]; |
| |
| hi1 = factor1 >> 32; |
| hi2 = factor2 >> 32; |
| |
| lo1 = factor1 & DT_MASK_LO; |
| lo2 = factor2 & DT_MASK_LO; |
| |
| product[0] = lo1 * lo2; |
| product[1] = hi1 * hi2; |
| |
| tmp[0] = hi1 * lo2; |
| tmp[1] = 0; |
| dtrace_shift_128(tmp, 32); |
| dtrace_add_128(product, tmp, product); |
| |
| tmp[0] = hi2 * lo1; |
| tmp[1] = 0; |
| dtrace_shift_128(tmp, 32); |
| dtrace_add_128(product, tmp, product); |
| } |
| |
| /* |
| * This privilege check should be used by actions and subroutines to |
| * verify that the user credentials of the process that enabled the |
| * invoking ECB match the target credentials. |
| */ |
| static int |
| dtrace_priv_proc_common_user(dtrace_state_t *state) |
| { |
| cred_t *cr, *s_cr = state->dts_cred.dcr_cred; |
| |
| /* |
| * We should always have a non-NULL state cred here, since if cred |
| * is null (anonymous tracing), we fast-path bypass this routine. |
| */ |
| ASSERT(s_cr != NULL); |
| |
| if ((cr = CRED()) != NULL && |
| s_cr->cr_uid == cr->cr_uid && |
| s_cr->cr_uid == cr->cr_ruid && |
| s_cr->cr_uid == cr->cr_suid && |
| s_cr->cr_gid == cr->cr_gid && |
| s_cr->cr_gid == cr->cr_rgid && |
| s_cr->cr_gid == cr->cr_sgid) |
| return (1); |
| |
| return (0); |
| } |
| |
| /* |
| * This privilege check should be used by actions and subroutines to |
| * verify that the zone of the process that enabled the invoking ECB |
| * matches the target credentials. |
| */ |
| static int |
| dtrace_priv_proc_common_zone(dtrace_state_t *state) |
| { |
| cred_t *cr, *s_cr = state->dts_cred.dcr_cred; |
| |
| /* |
| * We should always have a non-NULL state cred here, since if cred |
| * is null (anonymous tracing), we fast-path bypass this routine. |
| */ |
| ASSERT(s_cr != NULL); |
| |
| if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone) |
| return (1); |
| |
| return (0); |
| } |
| |
| /* |
| * This privilege check should be used by actions and subroutines to |
| * verify that the process has not setuid or changed credentials. |
| */ |
| static int |
| dtrace_priv_proc_common_nocd() |
| { |
| proc_t *proc; |
| |
| if ((proc = ttoproc(curthread)) != NULL && |
| !(proc->p_flag & SNOCD)) |
| return (1); |
| |
| return (0); |
| } |
| |
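| /* |
| * The dtrace_priv_*() routines that follow determine whether the consumer |
| * holds sufficient privilege for the requested class of access, based on |
| * the mstate's dtms_access bits and the dcr_action bits of the consumer's |
| * credential state; on failure, they set the appropriate CPU_DTRACE_UPRIV |
| * or CPU_DTRACE_KPRIV flag and return zero. |
| */ |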
| static int |
| dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate) |
| { |
| int action = state->dts_cred.dcr_action; |
| |
| if (!(mstate->dtms_access & DTRACE_ACCESS_PROC)) |
| goto bad; |
| |
| if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) && |
| dtrace_priv_proc_common_zone(state) == 0) |
| goto bad; |
| |
| if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) && |
| dtrace_priv_proc_common_user(state) == 0) |
| goto bad; |
| |
| if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) && |
| dtrace_priv_proc_common_nocd() == 0) |
| goto bad; |
| |
| return (1); |
| |
| bad: |
| cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; |
| |
| return (0); |
| } |
| |
| static int |
| dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate) |
| { |
| if (mstate->dtms_access & DTRACE_ACCESS_PROC) { |
| if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL) |
| return (1); |
| |
| if (dtrace_priv_proc_common_zone(state) && |
| dtrace_priv_proc_common_user(state) && |
| dtrace_priv_proc_common_nocd()) |
| return (1); |
| } |
| |
| cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; |
| |
| return (0); |
| } |
| |
| static int |
| dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate) |
| { |
| if ((mstate->dtms_access & DTRACE_ACCESS_PROC) && |
| (state->dts_cred.dcr_action & DTRACE_CRA_PROC)) |
| return (1); |
| |
| cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV; |
| |
| return (0); |
| } |
| |
| static int |
| dtrace_priv_kernel(dtrace_state_t *state) |
| { |
| if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL) |
| return (1); |
| |
| cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV; |
| |
| return (0); |
| } |
| |
| static int |
| dtrace_priv_kernel_destructive(dtrace_state_t *state) |
| { |
| if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE) |
| return (1); |
| |
| cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV; |
| |
| return (0); |
| } |
| |
| /* |
| * Determine if the dte_cond of the specified ECB allows for processing of |
| * the current probe to continue. Note that this routine may allow continued |
| * processing, but with access(es) stripped from the mstate's dtms_access |
| * field. |
| */ |
| static int |
| dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate, |
| dtrace_ecb_t *ecb) |
| { |
| dtrace_probe_t *probe = ecb->dte_probe; |
| dtrace_provider_t *prov = probe->dtpr_provider; |
| dtrace_pops_t *pops = &prov->dtpv_pops; |
| int mode = DTRACE_MODE_NOPRIV_DROP; |
| |
| ASSERT(ecb->dte_cond); |
| |
| if (pops->dtps_mode != NULL) { |
| mode = pops->dtps_mode(prov->dtpv_arg, |
| probe->dtpr_id, probe->dtpr_arg); |
| |
| ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL)); |
| ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT | |
| DTRACE_MODE_NOPRIV_DROP)); |
| } |
| |
| /* |
| * If the dte_cond bits indicate that this consumer is only allowed to |
| * see user-mode firings of this probe, check that the probe was fired |
| * while in a user context. If that's not the case, use the policy |
| * specified by the provider to determine if we drop the probe or |
| * merely restrict operation. |
| */ |
| if (ecb->dte_cond & DTRACE_COND_USERMODE) { |
| ASSERT(mode != DTRACE_MODE_NOPRIV_DROP); |
| |
| if (!(mode & DTRACE_MODE_USER)) { |
| if (mode & DTRACE_MODE_NOPRIV_DROP) |
| return (0); |
| |
| mstate->dtms_access &= ~DTRACE_ACCESS_ARGS; |
| } |
| } |
| |
| /* |
| * This is more subtle than it looks. We have to be absolutely certain |
| * that CRED() isn't going to change out from under us so it's only |
| * legit to examine that structure if we're in constrained situations. |
| * Currently, the only time we'll do this check is if a non-super-user |
| * has enabled the profile or syscall providers -- providers that |
| * allow visibility of all processes. For the profile case, the check |
| * above will ensure that we're examining a user context. |
| */ |
| if (ecb->dte_cond & DTRACE_COND_OWNER) { |
| cred_t *cr; |
| cred_t *s_cr = state->dts_cred.dcr_cred; |
| proc_t *proc; |
| |
| ASSERT(s_cr != NULL); |
| |
| if ((cr = CRED()) == NULL || |
| s_cr->cr_uid != cr->cr_uid || |
| s_cr->cr_uid != cr->cr_ruid || |
| s_cr->cr_uid != cr->cr_suid || |
| s_cr->cr_gid != cr->cr_gid || |
| s_cr->cr_gid != cr->cr_rgid || |
| s_cr->cr_gid != cr->cr_sgid || |
| (proc = ttoproc(curthread)) == NULL || |
| (proc->p_flag & SNOCD)) { |
| if (mode & DTRACE_MODE_NOPRIV_DROP) |
| return (0); |
| |
| mstate->dtms_access &= ~DTRACE_ACCESS_PROC; |
| } |
| } |
| |
| /* |
| * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not |
| * in our zone, check to see if our mode policy is to restrict rather |
| * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC |
| * and DTRACE_ACCESS_ARGS. |
| */ |
| if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) { |
| cred_t *cr; |
| cred_t *s_cr = state->dts_cred.dcr_cred; |
| |
| ASSERT(s_cr != NULL); |
| |
| if ((cr = CRED()) == NULL || |
| s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) { |
| if (mode & DTRACE_MODE_NOPRIV_DROP) |
| return (0); |
| |
| mstate->dtms_access &= |
| ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS); |
| } |
| } |
| |
| /* |
| * By merits of being in this code path at all, we have limited |
| * privileges. If the provider has indicated that limited privileges |
| * are to denote restricted operation, strip off the ability to access |
| * arguments. |
| */ |
| if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT) |
| mstate->dtms_access &= ~DTRACE_ACCESS_ARGS; |
| |
| return (1); |
| } |
| |
| /* |
| * Note: not called from probe context. This function is called |
| * asynchronously (and at a regular interval) from outside of probe context to |
| * clean the dirty dynamic variable lists on all CPUs. Dynamic variable |
| * cleaning is explained in detail in <sys/dtrace_impl.h>. |
| */ |
| void |
| dtrace_dynvar_clean(dtrace_dstate_t *dstate) |
| { |
| dtrace_dynvar_t *dirty; |
| dtrace_dstate_percpu_t *dcpu; |
| dtrace_dynvar_t **rinsep; |
| int i, j, work = 0; |
| |
| for (i = 0; i < NCPU; i++) { |
| dcpu = &dstate->dtds_percpu[i]; |
| rinsep = &dcpu->dtdsc_rinsing; |
| |
| /* |
| * If the dirty list is NULL, there is no dirty work to do. |
| */ |
| if (dcpu->dtdsc_dirty == NULL) |
| continue; |
| |
| if (dcpu->dtdsc_rinsing != NULL) { |
| /* |
| * If the rinsing list is non-NULL, then it is because |
| * this CPU was selected to accept another CPU's |
| * dirty list -- and since that time, dirty buffers |
| * have accumulated. This is a highly unlikely |
| * condition, but we choose to ignore the dirty |
| * buffers -- they'll be picked up in a future cleanse. |
| */ |
| continue; |
| } |
| |
| if (dcpu->dtdsc_clean != NULL) { |
| /* |
| * If the clean list is non-NULL, then we're in a |
| * situation where a CPU has done deallocations (we |
| * have a non-NULL dirty list) but no allocations (we |
| * also have a non-NULL clean list). We can't simply |
| * move the dirty list into the clean list on this |
| * CPU, yet we also don't want to allow this condition |
| * to persist, lest a short clean list prevent a |
| * massive dirty list from being cleaned (which in |
| * turn could lead to otherwise avoidable dynamic |
| * drops). To deal with this, we look for some CPU |
| * with a NULL clean list, NULL dirty list, and NULL |
| * rinsing list -- and then we borrow this CPU to |
| * rinse our dirty list. |
| */ |
| for (j = 0; j < NCPU; j++) { |
| dtrace_dstate_percpu_t *rinser; |
| |
| rinser = &dstate->dtds_percpu[j]; |
| |
| if (rinser->dtdsc_rinsing != NULL) |
| continue; |
| |
| if (rinser->dtdsc_dirty != NULL) |
| continue; |
| |
| if (rinser->dtdsc_clean != NULL) |
| continue; |
| |
| rinsep = &rinser->dtdsc_rinsing; |
| break; |
| } |
| |
| if (j == NCPU) { |
| /* |
| * We were unable to find another CPU that |
| * could accept this dirty list -- we are |
| * therefore unable to clean it now. |
| */ |
| dtrace_dynvar_failclean++; |
| continue; |
| } |
| } |
| |
| work = 1; |
| |
| /* |
| * Atomically move the dirty list aside. |
| */ |
| do { |
| dirty = dcpu->dtdsc_dirty; |
| |
| /* |
| * Before we zap the dirty list, set the rinsing list. |
| * (This allows for a potential assertion in |
| * dtrace_dynvar(): if a free dynamic variable appears |
| * on a hash chain, either the dirty list or the |
| * rinsing list for some CPU must be non-NULL.) |
| */ |
| *rinsep = dirty; |
| dtrace_membar_producer(); |
| } while (dtrace_casptr(&dcpu->dtdsc_dirty, |
| dirty, NULL) != dirty); |
| } |
| |
| if (!work) { |
| /* |
| * We have no work to do; we can simply return. |
| */ |
| return; |
| } |
| |
| dtrace_sync(); |
| |
| for (i = 0; i < NCPU; i++) { |
| dcpu = &dstate->dtds_percpu[i]; |
| |
| if (dcpu->dtdsc_rinsing == NULL) |
| continue; |
| |
| /* |
| * We are now guaranteed that no hash chain contains a pointer |
| * into this dirty list; we can make it clean. |
| */ |
| ASSERT(dcpu->dtdsc_clean == NULL); |
| dcpu->dtdsc_clean = dcpu->dtdsc_rinsing; |
| dcpu->dtdsc_rinsing = NULL; |
| } |
| |
| /* |
| * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make |
| * sure that all CPUs have seen all of the dtdsc_clean pointers. |
| * This prevents a race whereby a CPU incorrectly decides that |
| * the state should be something other than DTRACE_DSTATE_CLEAN |
| * after dtrace_dynvar_clean() has completed. |
| */ |
| dtrace_sync(); |
| |
| dstate->dtds_state = DTRACE_DSTATE_CLEAN; |
| } |
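| |
| /* |
| * Editorial sketch (not part of the DTrace implementation): the dirty-to- |
| * rinsing hand-off above is an instance of a lock-free "detach the whole |
| * list" idiom -- publish the list through a second pointer, then swing the |
| * head to NULL with a compare-and-swap. The fragment below shows the same |
| * pattern on a toy list type using the generic atomic_cas_ptr() and |
| * membar_producer() primitives; example_node_t, example_detach_all() and |
| * the EXAMPLE_SKETCHES guard are hypothetical names used only for |
| * illustration. |
| */ |
| #ifdef EXAMPLE_SKETCHES |
| typedef struct example_node { |
| 	struct example_node *en_next; |
| } example_node_t; |
| |
| static example_node_t * |
| example_detach_all(example_node_t *volatile *headp, example_node_t **asidep) |
| { |
| 	example_node_t *list; |
| |
| 	do { |
| 		list = *headp; |
| 		*asidep = list;		/* publish before zapping the head */ |
| 		membar_producer();	/* order the publication */ |
| 	} while (atomic_cas_ptr((volatile void *)headp, list, NULL) != list); |
| |
| 	return (list); |
| } |
| #endif |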
| |
| /* |
| * Depending on the value of the op parameter, this function looks up, |
| * allocates, or deallocates an arbitrarily-keyed dynamic variable. If an |
| * allocation is requested, this function will return a pointer to a |
| * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no |
| * variable can be allocated. If NULL is returned, the appropriate counter |
| * will be incremented. |
| */ |
| dtrace_dynvar_t * |
| dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys, |
| dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op, |
| dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) |
| { |
| uint64_t hashval = DTRACE_DYNHASH_VALID; |
| dtrace_dynhash_t *hash = dstate->dtds_hash; |
| dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL; |
| processorid_t me = CPU->cpu_id, cpu = me; |
| dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me]; |
| size_t bucket, ksize; |
| size_t chunksize = dstate->dtds_chunksize; |
| uintptr_t kdata, lock, nstate; |
| uint_t i; |
| |
| ASSERT(nkeys != 0); |
| |
| /* |
| * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time" |
| * algorithm. For the by-value portions, we perform the algorithm in |
| * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a |
| * bit, and seems to have only a minute effect on distribution. For |
| * the by-reference data, we perform "One-at-a-time" iterating (safely) |
| * over each referenced byte. It's painful to do this, but it's much |
| * better than pathological hash distribution. The efficacy of the |
| * hashing algorithm (and a comparison with other algorithms) may be |
| * found by running the ::dtrace_dynstat MDB dcmd. |
| */ |
| for (i = 0; i < nkeys; i++) { |
| if (key[i].dttk_size == 0) { |
| uint64_t val = key[i].dttk_value; |
| |
| hashval += (val >> 48) & 0xffff; |
| hashval += (hashval << 10); |
| hashval ^= (hashval >> 6); |
| |
| hashval += (val >> 32) & 0xffff; |
| hashval += (hashval << 10); |
| hashval ^= (hashval >> 6); |
| |
| hashval += (val >> 16) & 0xffff; |
| hashval += (hashval << 10); |
| hashval ^= (hashval >> 6); |
| |
| hashval += val & 0xffff; |
| hashval += (hashval << 10); |
| hashval ^= (hashval >> 6); |
| } else { |
| /* |
| * This is incredibly painful, but it beats the hell |
| * out of the alternative. |
| */ |
| uint64_t j, size = key[i].dttk_size; |
| uintptr_t base = (uintptr_t)key[i].dttk_value; |
| |
| if (!dtrace_canload(base, size, mstate, vstate)) |
| break; |
| |
| for (j = 0; j < size; j++) { |
| hashval += dtrace_load8(base + j); |
| hashval += (hashval << 10); |
| hashval ^= (hashval >> 6); |
| } |
| } |
| } |
| |
| if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) |
| return (NULL); |
| |
| hashval += (hashval << 3); |
| hashval ^= (hashval >> 11); |
| hashval += (hashval << 15); |
| |
| /* |
| * There is a remote chance (ideally, 1 in 2^31) that our hashval |
| * comes out to be one of our two sentinel hash values. If this |
| * actually happens, we set the hashval to be a value known to be a |
| * non-sentinel value. |
| */ |
| if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK) |
| hashval = DTRACE_DYNHASH_VALID; |
| |
| /* |
| * Yes, it's painful to do a divide here. If the cycle count becomes |
| * important here, tricks can be pulled to reduce it. (However, it's |
| * critical that hash collisions be kept to an absolute minimum; |
| * they're much more painful than a divide.) It's better to have a |
| * solution that generates few collisions and still keeps things |
| * relatively simple. |
| */ |
| bucket = hashval % dstate->dtds_hashsize; |
| |
| if (op == DTRACE_DYNVAR_DEALLOC) { |
| volatile uintptr_t *lockp = &hash[bucket].dtdh_lock; |
| |
| for (;;) { |
| while ((lock = *lockp) & 1) |
| continue; |
| |
| if (dtrace_casptr((void *)lockp, |
| (void *)lock, (void *)(lock + 1)) == (void *)lock) |
| break; |
| } |
| |
| dtrace_membar_producer(); |
| } |
| |
| top: |
| prev = NULL; |
| lock = hash[bucket].dtdh_lock; |
| |
| dtrace_membar_consumer(); |
| |
| start = hash[bucket].dtdh_chain; |
| ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK || |
| start->dtdv_hashval != DTRACE_DYNHASH_FREE || |
| op != DTRACE_DYNVAR_DEALLOC)); |
| |
| for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) { |
| dtrace_tuple_t *dtuple = &dvar->dtdv_tuple; |
| dtrace_key_t *dkey = &dtuple->dtt_key[0]; |
| |
| if (dvar->dtdv_hashval != hashval) { |
| if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) { |
| /* |
| * We've reached the sink, and therefore the |
| * end of the hash chain; we can kick out of |
| * the loop knowing that we have seen a valid |
| * snapshot of state. |
| */ |
| ASSERT(dvar->dtdv_next == NULL); |
| ASSERT(dvar == &dtrace_dynhash_sink); |
| break; |
| } |
| |
| if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) { |
| /* |
| * We've gone off the rails: somewhere along |
| * the line, one of the members of this hash |
| * chain was deleted. Note that we could also |
| * detect this by simply letting this loop run |
| * to completion, as we would eventually hit |
| * the end of the dirty list. However, we |
| * want to avoid running the length of the |
| * dirty list unnecessarily (it might be quite |
| * long), so we catch this as early as |
| * possible by detecting the hash marker. In |
| * this case, we simply set dvar to NULL and |
| * break; the conditional after the loop will |
| * send us back to top. |
| */ |
| dvar = NULL; |
| break; |
| } |
| |
| goto next; |
| } |
| |
| if (dtuple->dtt_nkeys != nkeys) |
| goto next; |
| |
| for (i = 0; i < nkeys; i++, dkey++) { |
| if (dkey->dttk_size != key[i].dttk_size) |
| goto next; /* size or type mismatch */ |
| |
| if (dkey->dttk_size != 0) { |
| if (dtrace_bcmp( |
| (void *)(uintptr_t)key[i].dttk_value, |
| (void *)(uintptr_t)dkey->dttk_value, |
| dkey->dttk_size)) |
| goto next; |
| } else { |
| if (dkey->dttk_value != key[i].dttk_value) |
| goto next; |
| } |
| } |
| |
| if (op != DTRACE_DYNVAR_DEALLOC) |
| return (dvar); |
| |
| ASSERT(dvar->dtdv_next == NULL || |
| dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE); |
| |
| if (prev != NULL) { |
| ASSERT(hash[bucket].dtdh_chain != dvar); |
| ASSERT(start != dvar); |
| ASSERT(prev->dtdv_next == dvar); |
| prev->dtdv_next = dvar->dtdv_next; |
| } else { |
| if (dtrace_casptr(&hash[bucket].dtdh_chain, |
| start, dvar->dtdv_next) != start) { |
| /* |
| * We have failed to atomically swing the |
| * hash table head pointer, presumably because |
| * of a conflicting allocation on another CPU. |
| * We need to reread the hash chain and try |
| * again. |
| */ |
| goto top; |
| } |
| } |
| |
| dtrace_membar_producer(); |
| |
| /* |
| * Now set the hash value to indicate that it's free. |
| */ |
| ASSERT(hash[bucket].dtdh_chain != dvar); |
| dvar->dtdv_hashval = DTRACE_DYNHASH_FREE; |
| |
| dtrace_membar_producer(); |
| |
| /* |
| * Set the next pointer to point at the dirty list, and |
| * atomically swing the dirty pointer to the newly freed dvar. |
| */ |
| do { |
| next = dcpu->dtdsc_dirty; |
| dvar->dtdv_next = next; |
| } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next); |
| |
| /* |
| * Finally, unlock this hash bucket. |
| */ |
| ASSERT(hash[bucket].dtdh_lock == lock); |
| ASSERT(lock & 1); |
| hash[bucket].dtdh_lock++; |
| |
| return (NULL); |
| next: |
| prev = dvar; |
| continue; |
| } |
| |
| if (dvar == NULL) { |
| /* |
| * If dvar is NULL, it is because we went off the rails: |
| * one of the elements that we traversed in the hash chain |
| * was deleted while we were traversing it. In this case, |
| * we assert that we aren't doing a dealloc (deallocs lock |
| * the hash bucket to prevent themselves from racing with |
| * one another), and retry the hash chain traversal. |
| */ |
| ASSERT(op != DTRACE_DYNVAR_DEALLOC); |
| goto top; |
| } |
| |
| if (op != DTRACE_DYNVAR_ALLOC) { |
| /* |
| * If we are not to allocate a new variable, we want to |
| * return NULL now. Before we return, check that the value |
| * of the lock word hasn't changed. If it has, we may have |
| * seen an inconsistent snapshot. |
| */ |
| if (op == DTRACE_DYNVAR_NOALLOC) { |
| if (hash[bucket].dtdh_lock != lock) |
| goto top; |
| } else { |
| ASSERT(op == DTRACE_DYNVAR_DEALLOC); |
| ASSERT(hash[bucket].dtdh_lock == lock); |
| ASSERT(lock & 1); |
| hash[bucket].dtdh_lock++; |
| } |
| |
| return (NULL); |
| } |
| |
| /* |
| * We need to allocate a new dynamic variable. The size we need is the |
| * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the |
| * size of any auxiliary key data (rounded up to 8-byte alignment) plus |
| * the size of any referred-to data (dsize). We then round the final |
| * size up to the chunksize for allocation. |
| */ |
| for (ksize = 0, i = 0; i < nkeys; i++) |
| ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t)); |
| |
| /* |
| * This should be pretty much impossible, but could happen if, say, |
| * strange DIF specified the tuple. Ideally, this should be an |
| * assertion and not an error condition -- but that requires that the |
| * chunksize calculation in dtrace_difo_chunksize() be absolutely |
| * bullet-proof. (That is, it must not be able to be fooled by |
| * malicious DIF.) Given the lack of backwards branches in DIF, |
| * solving this would presumably not amount to solving the Halting |
| * Problem -- but it still seems awfully hard. |
| */ |
| if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) + |
| ksize + dsize > chunksize) { |
| dcpu->dtdsc_drops++; |
| return (NULL); |
| } |
| |
| nstate = DTRACE_DSTATE_EMPTY; |
| |
| do { |
| retry: |
| free = dcpu->dtdsc_free; |
| |
| if (free == NULL) { |
| dtrace_dynvar_t *clean = dcpu->dtdsc_clean; |
| void *rval; |
| |
| if (clean == NULL) { |
| /* |
| * We're out of dynamic variable space on |
| * this CPU. Unless we have tried all CPUs, |
| * we'll try to allocate from a different |
| * CPU. |
| */ |
| switch (dstate->dtds_state) { |
| case DTRACE_DSTATE_CLEAN: { |
| void *sp = &dstate->dtds_state; |
| |
| if (++cpu >= NCPU) |
| cpu = 0; |
| |
| if (dcpu->dtdsc_dirty != NULL && |
| nstate == DTRACE_DSTATE_EMPTY) |
| nstate = DTRACE_DSTATE_DIRTY; |
| |
| if (dcpu->dtdsc_rinsing != NULL) |
| nstate = DTRACE_DSTATE_RINSING; |
| |
| dcpu = &dstate->dtds_percpu[cpu]; |
| |
| if (cpu != me) |
| goto retry; |
| |
| (void) dtrace_cas32(sp, |
| DTRACE_DSTATE_CLEAN, nstate); |
| |
| /* |
| * To increment the correct bean |
| * counter, take another lap. |
| */ |
| goto retry; |
| } |
| |
| case DTRACE_DSTATE_DIRTY: |
| dcpu->dtdsc_dirty_drops++; |
| break; |
| |
| case DTRACE_DSTATE_RINSING: |
| dcpu->dtdsc_rinsing_drops++; |
| break; |
| |
| case DTRACE_DSTATE_EMPTY: |
| dcpu->dtdsc_drops++; |
| break; |
| } |
| |
| DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP); |
| return (NULL); |
| } |
| |
| /* |
| * The clean list appears to be non-empty. We want to |
| * move the clean list to the free list; we start by |
| * moving the clean pointer aside. |
| */ |
| if (dtrace_casptr(&dcpu->dtdsc_clean, |
| clean, NULL) != clean) { |
| /* |
| * We are in one of two situations: |
| * |
| * (a) The clean list was switched to the |
| * free list by another CPU. |
| * |
| * (b) The clean list was added to by the |
| * cleansing cyclic. |
| * |
| * In either of these situations, we can |
| * just reattempt the free list allocation. |
| */ |
| goto retry; |
| } |
| |
| ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE); |
| |
| /* |
| * Now we'll move the clean list to our free list. |
| * It's impossible for this to fail: the only way |
| * the free list can be updated is through this |
| * code path, and only one CPU can own the clean list. |
| * Thus, it would only be possible for this to fail if |
| * this code were racing with dtrace_dynvar_clean(). |
| * (That is, if dtrace_dynvar_clean() updated the clean |
| * list, and we ended up racing to update the free |
| * list.) This race is prevented by the dtrace_sync() |
| * in dtrace_dynvar_clean() -- which flushes the |
| * owners of the clean lists out before resetting |
| * the clean lists. |
| */ |
| dcpu = &dstate->dtds_percpu[me]; |
| rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean); |
| ASSERT(rval == NULL); |
| goto retry; |
| } |
| |
| dvar = free; |
| new_free = dvar->dtdv_next; |
| } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free); |
| |
| /* |
| * We have now allocated a new chunk. We copy the tuple keys into the |
| * tuple array and copy any referenced key data into the data space |
| * following the tuple array. As we do this, we relocate dttk_value |
| * in the final tuple to point to the key data address in the chunk. |
| */ |
| kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys]; |
| dvar->dtdv_data = (void *)(kdata + ksize); |
| dvar->dtdv_tuple.dtt_nkeys = nkeys; |
| |
| for (i = 0; i < nkeys; i++) { |
| dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i]; |
| size_t kesize = key[i].dttk_size; |
| |
| if (kesize != 0) { |
| dtrace_bcopy( |
| (const void *)(uintptr_t)key[i].dttk_value, |
| (void *)kdata, kesize); |
| dkey->dttk_value = kdata; |
| kdata += P2ROUNDUP(kesize, sizeof (uint64_t)); |
| } else { |
| dkey->dttk_value = key[i].dttk_value; |
| } |
| |
| dkey->dttk_size = kesize; |
| } |
| |
| ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE); |
| dvar->dtdv_hashval = hashval; |
| dvar->dtdv_next = start; |
| |
| if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start) |
| return (dvar); |
| |
| /* |
| * The cas has failed. Either another CPU is adding an element to |
| * this hash chain, or another CPU is deleting an element from this |
| * hash chain. The simplest way to deal with both of these cases |
| * (though not necessarily the most efficient) is to free our |
| * allocated block and re-attempt it all. Note that the free is |
| * to the dirty list and _not_ to the free list. This is to prevent |
| * races with allocators, above. |
| */ |
| dvar->dtdv_hashval = DTRACE_DYNHASH_FREE; |
| |
| dtrace_membar_producer(); |
| |
| do { |
| free = dcpu->dtdsc_dirty; |
| dvar->dtdv_next = free; |
| } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free); |
| |
| goto top; |
| } |
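| |
| /* |
| * Editorial sketch (not part of the DTrace implementation): the hashing |
| * loop in dtrace_dynvar() above is Jenkins' "One-at-a-time" hash fed 16 |
| * bits at a time for by-value keys, followed by the usual three-step |
| * finalization. The stand-alone helper below shows the same computation |
| * for a single 64-bit key, including the sentinel avoidance; |
| * example_hash_u64() and the EXAMPLE_SKETCHES guard are hypothetical. |
| */ |
| #ifdef EXAMPLE_SKETCHES |
| static uint64_t |
| example_hash_u64(uint64_t val) |
| { |
| 	uint64_t hashval = DTRACE_DYNHASH_VALID; |
| 	int shift; |
| |
| 	/* Mix the value in 16-bit chunks, most significant chunk first. */ |
| 	for (shift = 48; shift >= 0; shift -= 16) { |
| 		hashval += (val >> shift) & 0xffff; |
| 		hashval += (hashval << 10); |
| 		hashval ^= (hashval >> 6); |
| 	} |
| |
| 	hashval += (hashval << 3); |
| 	hashval ^= (hashval >> 11); |
| 	hashval += (hashval << 15); |
| |
| 	/* Avoid the two sentinel hash values, as dtrace_dynvar() does. */ |
| 	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK) |
| 		hashval = DTRACE_DYNHASH_VALID; |
| |
| 	return (hashval); |
| } |
| #endif |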
| |
| /*ARGSUSED*/ |
| static void |
| dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg) |
| { |
| if ((int64_t)nval < (int64_t)*oval) |
| *oval = nval; |
| } |
| |
| /*ARGSUSED*/ |
| static void |
| dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg) |
| { |
| if ((int64_t)nval > (int64_t)*oval) |
| *oval = nval; |
| } |
| |
| static void |
| dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr) |
| { |
| int i, zero = DTRACE_QUANTIZE_ZEROBUCKET; |
| int64_t val = (int64_t)nval; |
| |
| if (val < 0) { |
| for (i = 0; i < zero; i++) { |
| if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) { |
| quanta[i] += incr; |
| return; |
| } |
| } |
| } else { |
| for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) { |
| if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) { |
| quanta[i - 1] += incr; |
| return; |
| } |
| } |
| |
| quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr; |
| return; |
| } |
| |
| ASSERT(0); |
| } |
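| |
| /* |
| * Editorial sketch (not part of the DTrace implementation): quantize() |
| * buckets values by power of two, with a zero bucket in the middle and |
| * mirrored buckets for negative values, as the code above shows. The |
| * helper below illustrates just the core power-of-two idea for a |
| * non-negative value -- the bucket index is essentially the position of |
| * the highest set bit -- without reproducing the exact |
| * DTRACE_QUANTIZE_BUCKETVAL() layout; example_pow2_bucket() and the |
| * EXAMPLE_SKETCHES guard are hypothetical. |
| */ |
| #ifdef EXAMPLE_SKETCHES |
| static int |
| example_pow2_bucket(uint64_t val) |
| { |
| 	int bucket = 0; |
| |
| 	/* 0 -> bucket 0, 1 -> 1, 2-3 -> 2, 4-7 -> 3, and so on. */ |
| 	while (val != 0) { |
| 		val >>= 1; |
| 		bucket++; |
| 	} |
| |
| 	return (bucket); |
| } |
| #endif |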
| |
| static void |
| dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr) |
| { |
| uint64_t arg = *lquanta++; |
| int32_t base = DTRACE_LQUANTIZE_BASE(arg); |
| uint16_t step = DTRACE_LQUANTIZE_STEP(arg); |
| uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg); |
| int32_t val = (int32_t)nval, level; |
| |
| ASSERT(step != 0); |
| ASSERT(levels != 0); |
| |
| if (val < base) { |
| /* |
| * This is an underflow. |
| */ |
| lquanta[0] += incr; |
| return; |
| } |
| |
| level = (val - base) / step; |
| |
| if (level < levels) { |
| lquanta[level + 1] += incr; |
| return; |
| } |
| |
| /* |
| * This is an overflow. |
| */ |
| lquanta[levels + 1] += incr; |
| } |
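| |
| /* |
| * Editorial sketch (not part of the DTrace implementation): for |
| * lquantize(), the value lands in one of levels + 2 buckets -- index 0 for |
| * underflow, 1 through levels for the linear range, and levels + 1 for |
| * overflow -- exactly as the indexing above shows. The helper below |
| * computes the same bucket index from already-decoded parameters; for |
| * instance, with base 0, step 10 and 10 levels, a value of 37 falls in |
| * bucket 4 (the [30, 40) range). example_lquantize_bucket() and the |
| * EXAMPLE_SKETCHES guard are hypothetical. |
| */ |
| #ifdef EXAMPLE_SKETCHES |
| static int |
| example_lquantize_bucket(int32_t base, uint16_t step, uint16_t levels, |
|     int32_t val) |
| { |
| 	int32_t level; |
| |
| 	if (val < base) |
| 		return (0);		/* underflow bucket */ |
| |
| 	level = (val - base) / step; |
| |
| 	if (level < levels) |
| 		return (level + 1);	/* linear range */ |
| |
| 	return (levels + 1);		/* overflow bucket */ |
| } |
| #endif |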
| |
| static int |
| dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low, |
| uint16_t high, uint16_t nsteps, int64_t value) |
| { |
| int64_t this = 1, last, next; |
| int base = 1, order; |
| |
| ASSERT(factor <= nsteps); |
| ASSERT(nsteps % factor == 0); |
| |
| for (order = 0; order < low; order++) |
| this *= factor; |
| |
| /* |
| * If our value is less than our factor taken to the power of the |
| * low order of magnitude, it goes into the zeroth bucket. |
| */ |
| if (value < (last = this)) |
| return (0); |
| |
| for (this *= factor; order <= high; order++) { |
| int nbuckets = this > nsteps ? nsteps : this; |
| |
| if ((next = this * factor) < this) { |
| /* |
| * We should not generally get log/linear quantizations |
| * with a high magnitude that allows 64-bits to |
| * overflow, but we nonetheless protect against this |
| * by explicitly checking for overflow, and clamping |
| * our value accordingly. |
| */ |
| value = this - 1; |
| } |
| |
| if (value < this) { |
| /* |
| * If our value lies within this order of magnitude, |
| * determine its position by taking the offset within |
| * the order of magnitude, dividing by the bucket |
| * width, and adding to our (accumulated) base. |
| */ |
| return (base + (value - last) / (this / nbuckets)); |
| } |
| |
| base += nbuckets - (nbuckets / factor); |
| last = this; |
| this = next; |
| } |
| |
| /* |
| * Our value is greater than or equal to our factor taken to the |
| * power of one plus the high magnitude -- return the top bucket. |
| */ |
| return (base); |
| } |
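| |
| /* |
| * Editorial worked example (not part of the DTrace implementation): with |
| * factor 10, low 0, high 2 and nsteps 10, the bucket function above yields |
| * bucket 0 for values below 1, nine buckets for each of the decades 1-9, |
| * 10-99 and 100-999 (indices 1-9, 10-18 and 19-27, each bucket one tenth |
| * of its decade wide), and bucket 28 for everything at or above 1000 -- |
| * 29 buckets in total. The check below exercises a few of these cases; |
| * example_llquantize_check() and the EXAMPLE_SKETCHES guard are |
| * hypothetical. |
| */ |
| #ifdef EXAMPLE_SKETCHES |
| static void |
| example_llquantize_check(void) |
| { |
| 	ASSERT(dtrace_aggregate_llquantize_bucket(10, 0, 2, 10, 0) == 0); |
| 	ASSERT(dtrace_aggregate_llquantize_bucket(10, 0, 2, 10, 7) == 7); |
| 	ASSERT(dtrace_aggregate_llquantize_bucket(10, 0, 2, 10, 42) == 13); |
| 	ASSERT(dtrace_aggregate_llquantize_bucket(10, 0, 2, 10, 950) == 27); |
| 	ASSERT(dtrace_aggregate_llquantize_bucket(10, 0, 2, 10, 1000) == 28); |
| } |
| #endif |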
| |
| static void |
| dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr) |
| { |
| uint64_t arg = *llquanta++; |
| uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg); |
| uint16_t low = DTRACE_LLQUANTIZE_LOW(arg); |
| uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg); |
| uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg); |
| |
| llquanta[dtrace_aggregate_llquantize_bucket(factor, |
| low, high, nsteps, nval)] += incr; |
| } |
| |
| /*ARGSUSED*/ |
| static void |
| dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg) |
| { |
| data[0]++; |
| data[1] += nval; |
| } |
| |
| /*ARGSUSED*/ |
| static void |
| dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg) |
| { |
| int64_t snval = (int64_t)nval; |
| uint64_t tmp[2]; |
| |
| data[0]++; |
| data[1] += nval; |
| |
| /* |
| * What we want to say here is: |
| * |
| * data[2] += nval * nval; |
| * |
| * But given that nval is 64-bit, we could easily overflow, so |
| * we do this as 128-bit arithmetic. |
| */ |
| if (snval < 0) |
| snval = -snval; |
| |
| dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp); |
| dtrace_add_128(data + 2, tmp, data + 2); |
| } |
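| |
| /* |
| * Editorial sketch (not part of the DTrace implementation): the stddev |
| * accumulation above avoids 64-bit overflow by keeping the sum of squares |
| * as a 128-bit quantity via dtrace_multiply_128() and dtrace_add_128(). |
| * One way to form a 64x64 -> 128-bit product -- shown purely for |
| * illustration, and not necessarily how dtrace_multiply_128() itself is |
| * written -- is schoolbook multiplication on 32-bit halves. |
| * example_mul_128() and the EXAMPLE_SKETCHES guard are hypothetical; the |
| * product is returned as a (low, high) pair. |
| */ |
| #ifdef EXAMPLE_SKETCHES |
| static void |
| example_mul_128(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi) |
| { |
| 	uint64_t a_lo = a & 0xffffffffULL, a_hi = a >> 32; |
| 	uint64_t b_lo = b & 0xffffffffULL, b_hi = b >> 32; |
| 	uint64_t p0 = a_lo * b_lo;	/* bits 0..63 */ |
| 	uint64_t p1 = a_lo * b_hi;	/* bits 32..95 */ |
| 	uint64_t p2 = a_hi * b_lo;	/* bits 32..95 */ |
| 	uint64_t p3 = a_hi * b_hi;	/* bits 64..127 */ |
| 	uint64_t mid; |
| |
| 	mid = (p0 >> 32) + (p1 & 0xffffffffULL) + (p2 & 0xffffffffULL); |
| |
| 	*lo = (p0 & 0xffffffffULL) | (mid << 32); |
| 	*hi = p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32); |
| } |
| #endif |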
| |
| /*ARGSUSED*/ |
| static void |
| dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg) |
| { |
| *oval = *oval + 1; |
| } |
| |
| /*ARGSUSED*/ |
| static void |
| dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg) |
| { |
| *oval += nval; |
| } |
| |
| /* |
| * Aggregate given the tuple in the principal data buffer, and the aggregating |
| * action denoted by the specified dtrace_aggregation_t. The aggregation |
| * buffer is specified as the buf parameter. This routine does not return |
| * failure; if there is no space in the aggregation buffer, the data will be |
| * dropped, and a corresponding counter incremented. |
| */ |
| static void |
| dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf, |
| intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg) |
| { |
| dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec; |
| uint32_t i, ndx, size, fsize; |
| uint32_t align = sizeof (uint64_t) - 1; |
| dtrace_aggbuffer_t *agb; |
| dtrace_aggkey_t *key; |
| uint32_t hashval = 0, limit, isstr; |
| caddr_t tomax, data, kdata; |
| dtrace_actkind_t action; |
| dtrace_action_t *act; |
| uintptr_t offs; |
| |
| if (buf == NULL) |
| return; |
| |
| if (!agg->dtag_hasarg) { |
| /* |
| * Currently, only quantize() and lquantize() take additional |
| * arguments, and they have the same semantics: an increment |
| * value that defaults to 1 when not present. If additional |
| * aggregating actions take arguments, the setting of the |
| * default argument value will presumably have to become more |
| * sophisticated... |
| */ |
| arg = 1; |
| } |
| |
| action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION; |
| size = rec->dtrd_offset - agg->dtag_base; |
| fsize = size + rec->dtrd_size; |
| |
| ASSERT(dbuf->dtb_tomax != NULL); |
| data = dbuf->dtb_tomax + offset + agg->dtag_base; |
| |
| if ((tomax = buf->dtb_tomax) == NULL) { |
| dtrace_buffer_drop(buf); |
| return; |
| } |
| |
| /* |
| * The metastructure is always at the bottom of the buffer. |
| */ |
| agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size - |
| sizeof (dtrace_aggbuffer_t)); |
| |
| if (buf->dtb_offset == 0) { |
| /* |
| * We just kludge up approximately 1/8th of the size to be |
| * buckets. If this guess ends up being routinely |
| * off-the-mark, we may need to dynamically readjust this |
| * based on past performance. |
| */ |
| uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t); |
| |
| if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) < |
| (uintptr_t)tomax || hashsize == 0) { |
| /* |
| * We've been given a ludicrously small buffer; |
| * increment our drop count and leave. |
| */ |
| dtrace_buffer_drop(buf); |
| return; |
| } |
| |
| /* |
| * And now, a pathetic attempt to get an odd (or |
| * perchance, a prime) hash size for better hash distribution. |
| */ |
| if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3)) |
| hashsize -= DTRACE_AGGHASHSIZE_SLEW; |
| |
| agb->dtagb_hashsize = hashsize; |
| agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb - |
| agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *)); |
| agb->dtagb_free = (uintptr_t)agb->dtagb_hash; |
| |
| for (i = 0; i < agb->dtagb_hashsize; i++) |
| agb->dtagb_hash[i] = NULL; |
| } |
| |
| ASSERT(agg->dtag_first != NULL); |
| ASSERT(agg->dtag_first->dta_intuple); |
| |
| /* |
| * Calculate the hash value based on the key. Note that we _don't_ |
| * include the aggid in the hashing (but we will store it as part of |
| * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time" |
| * algorithm: a simple, quick algorithm that has no known funnels, and |
| * gets good distribution in practice. The efficacy of the hashing |
| * algorithm (and a comparison with other algorithms) may be found by |
| * running the ::dtrace_aggstat MDB dcmd. |
| */ |
| for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) { |
| i = act->dta_rec.dtrd_offset - agg->dtag_base; |
| limit = i + act->dta_rec.dtrd_size; |
| ASSERT(limit <= size); |
| isstr = DTRACEACT_ISSTRING(act); |
| |
| for (; i < limit; i++) { |
| hashval += data[i]; |
| hashval += (hashval << 10); |
| hashval ^= (hashval >> 6); |
| |
| if (isstr && data[i] == '\0') |
| break; |
| } |
| } |
| |
| hashval += (hashval << 3); |
| hashval ^= (hashval >> 11); |
| hashval += (hashval << 15); |
| |
| /* |
| * Yes, the divide here is expensive -- but it's generally the least |
| * of the performance issues given the amount of data that we iterate |
| * over to compute hash values, compare data, etc. |
| */ |
| ndx = hashval % agb->dtagb_hashsize; |
| |
| for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) { |
| ASSERT((caddr_t)key >= tomax); |
| ASSERT((caddr_t)key < tomax + buf->dtb_size); |
| |
| if (hashval != key->dtak_hashval || key->dtak_size != size) |
| continue; |
| |
| kdata = key->dtak_data; |
| ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size); |
| |
| for (act = agg->dtag_first; act->dta_intuple; |
| act = act->dta_next) { |
| i = act->dta_rec.dtrd_offset - agg->dtag_base; |
| limit = i + act->dta_rec.dtrd_size; |
| ASSERT(limit <= size); |
| isstr = DTRACEACT_ISSTRING(act); |
| |
| for (; i < limit; i++) { |
| if (kdata[i] != data[i]) |
| goto next; |
| |
| if (isstr && data[i] == '\0') |
| break; |
| } |
| } |
| |
| if (action != key->dtak_action) { |
| /* |
| * We are aggregating on the same value in the same |
| * aggregation with two different aggregating actions. |
| * (This should have been picked up in the compiler, |
| * so we may be dealing with errant or devious DIF.) |
| * This is an error condition; we indicate as much, |
| * and return. |
| */ |
| DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP); |
| return; |
| } |
| |
| /* |
| * This is a hit: we need to apply the aggregator to |
| * the value at this key. |
| */ |
| agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg); |
| return; |
| next: |
| continue; |
| } |
| |
| /* |
| * We didn't find it. We need to allocate some zero-filled space, |
| * link it into the hash table appropriately, and apply the aggregator |
| * to the (zero-filled) value. |
| */ |
| offs = buf->dtb_offset; |
| while (offs & (align - 1)) |
| offs += sizeof (uint32_t); |
| |
| /* |
| * If we don't have enough room to both allocate a new key _and_ |
| * its associated data, increment the drop count and return. |
| */ |
| if ((uintptr_t)tomax + offs + fsize > |
| agb->dtagb_free - sizeof (dtrace_aggkey_t)) { |
| dtrace_buffer_drop(buf); |
| return; |
| } |
| |
| /*CONSTCOND*/ |
| ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1))); |
| key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t)); |
| agb->dtagb_free -= sizeof (dtrace_aggkey_t); |
| |
| key->dtak_data = kdata = tomax + offs; |
| buf->dtb_offset = offs + fsize; |
| |
| /* |
| * Now copy the data across. |
| */ |
| *((dtrace_aggid_t *)kdata) = agg->dtag_id; |
| |
| for (i = sizeof (dtrace_aggid_t); i < size; i++) |
| kdata[i] = data[i]; |
| |
| /* |
| * Because strings are not zeroed out by default, we need to iterate |
| * looking for actions that store strings, and we need to explicitly |
| * pad these strings out with zeroes. |
| */ |
| for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) { |
| int nul; |
| |
| if (!DTRACEACT_ISSTRING(act)) |
| continue; |
| |
| i = act->dta_rec.dtrd_offset - agg->dtag_base; |
| limit = i + act->dta_rec.dtrd_size; |
| ASSERT(limit <= size); |
| |
| for (nul = 0; i < limit; i++) { |
| if (nul) { |
| kdata[i] = '\0'; |
| continue; |
| } |
| |
| if (data[i] != '\0') |
| continue; |
| |
| nul = 1; |
| } |
| } |
| |
| for (i = size; i < fsize; i++) |
| kdata[i] = 0; |
| |
| key->dtak_hashval = hashval; |
| key->dtak_size = size; |
| key->dtak_action = action; |
| key->dtak_next = agb->dtagb_hash[ndx]; |
| agb->dtagb_hash[ndx] = key; |
| |
| /* |
| * Finally, apply the aggregator. |
| */ |
| *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial; |
| agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg); |
| } |
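| |
| /* |
| * Editorial sketch (not part of the DTrace implementation): within an |
| * aggregation buffer, key data records grow up from the start of the |
| * buffer, the dtrace_aggbuffer_t metastructure sits at the very end, the |
| * hash bucket array lies immediately below it, and dtagb_free marks the |
| * boundary from which new dtrace_aggkey_t entries are carved downward. |
| * The helper below reproduces just the initial bucket-count calculation |
| * used above (roughly 1/8th of the buffer, slewed off a power of two); |
| * example_agg_hashsize() and the EXAMPLE_SKETCHES guard are hypothetical. |
| */ |
| #ifdef EXAMPLE_SKETCHES |
| static uintptr_t |
| example_agg_hashsize(size_t bufsize) |
| { |
| 	uintptr_t hashsize = (bufsize >> 3) / sizeof (uintptr_t); |
| |
| 	if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3)) |
| 		hashsize -= DTRACE_AGGHASHSIZE_SLEW; |
| |
| 	return (hashsize); |
| } |
| #endif |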
| |
| /* |
| * Given consumer state, this routine finds a speculation in the INACTIVE |
| * state and transitions it into the ACTIVE state. If there is no speculation |
| * in the INACTIVE state, 0 is returned. In this case, no error counter is |
| * incremented -- it is up to the caller to take appropriate action. |
| */ |
| static int |
| dtrace_speculation(dtrace_state_t *state) |
| { |
| int i = 0; |
| dtrace_speculation_state_t current; |
| uint32_t *stat = &state->dts_speculations_unavail, count; |
| |
| while (i < state->dts_nspeculations) { |
| dtrace_speculation_t *spec = &state->dts_speculations[i]; |
| |
| current = spec->dtsp_state; |
| |
| if (current != DTRACESPEC_INACTIVE) { |
| if (current == DTRACESPEC_COMMITTINGMANY || |
| current == DTRACESPEC_COMMITTING || |
| current == DTRACESPEC_DISCARDING) |
| stat = &state->dts_speculations_busy; |
| i++; |
| continue; |
| } |
| |
| if (dtrace_cas32((uint32_t *)&spec->dtsp_state, |
| current, DTRACESPEC_ACTIVE) == current) |
| return (i + 1); |
| } |
| |
| /* |
| * We couldn't find a speculation. If we found as much as a single |
| * busy speculation buffer, we'll attribute this failure as "busy" |
| * instead of "unavail". |
| */ |
| do { |
| count = *stat; |
| } while (dtrace_cas32(stat, count, count + 1) != count); |
| |
| return (0); |
| } |
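| |
| /* |
| * Editorial sketch (not part of the DTrace implementation): both the state |
| * transition and the failure accounting in dtrace_speculation() use the |
| * same non-blocking read/compare-and-swap idiom, which is what makes them |
| * safe to run from probe context. The helper below shows the counter form |
| * in isolation using dtrace_cas32(), as above; example_atomic_inc32() and |
| * the EXAMPLE_SKETCHES guard are hypothetical. |
| */ |
| #ifdef EXAMPLE_SKETCHES |
| static void |
| example_atomic_inc32(uint32_t *counter) |
| { |
| 	uint32_t count; |
| |
| 	do { |
| 		count = *counter; |
| 	} while (dtrace_cas32(counter, count, count + 1) != count); |
| } |
| #endif |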
| |
| /* |
| * This routine commits an active speculation. If the specified speculation |
| * is not in a valid state to perform a commit(), this routine will silently do |
| * nothing. The state of the specified speculation is transitioned according |
| * to the state transition diagram outlined in <sys/dtrace_impl.h>. |
| */ |
| static void |
| dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu, |
| dtrace_specid_t which) |
| { |
| dtrace_speculation_t *spec; |
| dtrace_buffer_t *src, *dest; |
| uintptr_t daddr, saddr, dlimit, slimit; |
| dtrace_speculation_state_t current, new; |
| intptr_t offs; |
| uint64_t timestamp; |
| |
| if (which == 0) |
| return; |
| |
| if (which > state->dts_nspeculations) { |
| cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP; |
| return; |
| } |
| |
| spec = &state->dts_speculations[which - 1]; |
| src = &spec->dtsp_buffer[cpu]; |
| dest = &state->dts_buffer[cpu]; |
| |
| do { |
| current = spec->dtsp_state; |
| |
| if (current == DTRACESPEC_COMMITTINGMANY) |
| break; |
| |
| switch (current) { |
| case DTRACESPEC_INACTIVE: |
| case DTRACESPEC_DISCARDING: |
| return; |
| |
| case DTRACESPEC_COMMITTING: |
| /* |
| * This is only possible if we are (a) commit()'ing |
| * without having done a prior speculate() on this CPU |
| * and (b) racing with another commit() on a different |
| * CPU. There's nothing to do -- we just assert that |
| * our offset is 0. |
| */ |
| ASSERT(src->dtb_offset == 0); |
| return; |
| |
| case DTRACESPEC_ACTIVE: |
| new = DTRACESPEC_COMMITTING; |
| break; |
| |
| case DTRACESPEC_ACTIVEONE: |
| /* |
| * This speculation is active on one CPU. If our |
| * buffer offset is non-zero, we know that the one CPU |
| * must be us. Otherwise, we are committing on a |
| * different CPU from the speculate(), and we must |
| * rely on being asynchronously cleaned. |
| */ |
| if (src->dtb_offset != 0) { |
| new = DTRACESPEC_COMMITTING; |
| break; |
| } |
| /*FALLTHROUGH*/ |
| |
| case DTRACESPEC_ACTIVEMANY: |
| new = DTRACESPEC_COMMITTINGMANY; |
| break; |
| |
| default: |
| ASSERT(0); |
| } |
| } while (dtrace_cas32((uint32_t *)&spec->dtsp_state, |
| current, new) != current); |
| |
| /* |
| * We have set the state to indicate that we are committing this |
| * speculation. Now reserve the necessary space in the destination |
| * buffer. |
| */ |
| if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset, |
| sizeof (uint64_t), state, NULL)) < 0) { |
| dtrace_buffer_drop(dest); |
| goto out; |
| } |
| |
| /* |
| * We have sufficient space to copy the speculative buffer into the |
| * primary buffer. First, modify the speculative buffer, filling |
| * in the timestamp of all entries with the current time. The data |
| * must have the commit() time rather than the time it was traced, |
| * so that all entries in the primary buffer are in timestamp order. |
| */ |
| timestamp = dtrace_gethrtime(); |
| saddr = (uintptr_t)src->dtb_tomax; |
| slimit = saddr + src->dtb_offset; |
| while (saddr < slimit) { |
| size_t size; |
| dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr; |
| |
| if (dtrh->dtrh_epid == DTRACE_EPIDNONE) { |
| saddr += sizeof (dtrace_epid_t); |
| continue; |
| } |
| ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs); |
| size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size; |
| |
| ASSERT3U(saddr + size, <=, slimit); |
| ASSERT3U(size, >=, sizeof (dtrace_rechdr_t)); |
| ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX); |
| |
| DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp); |
| |
| saddr += size; |
| } |
| |
| /* |
| * Copy the buffer across. (Note that this is a |
| * highly suboptimal bcopy(); in the unlikely event that this becomes |
| * a serious performance issue, a high-performance DTrace-specific |
| * bcopy() should obviously be invented.) |
|