/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace).  The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file.  The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#include <sys/ctf_api.h>
#include <sys/panic.h>
#include <sys/priv_impl.h>
#include <sys/policy.h>
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#include <sys/taskq.h>
#include <sys/mkdev.h>
#include <sys/kdi.h>
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include "strtolctype.h"

/*
 * DTrace Tunable Variables
 *
 * The following variables may be tuned by adding a line to /etc/system that
 * includes both the name of the DTrace module ("dtrace") and the name of the
 * variable.  For example:
 *
 *   set dtrace:dtrace_destructive_disallow = 1
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable.  Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.  Further, these variables should not be tuned
 * dynamically via "mdb -kw" or other means; they should only be tuned via
 * /etc/system.
 */
int		dtrace_destructive_disallow = 0;
dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t		dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t	dtrace_dof_maxsize = (8 * 1024 * 1024);
size_t		dtrace_statvar_maxsize = (16 * 1024);
size_t		dtrace_actions_max = (16 * 1024);
size_t		dtrace_retain_max = 1024;
dtrace_optval_t	dtrace_helper_actions_max = 1024;
dtrace_optval_t	dtrace_helper_providers_max = 32;
dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t		dtrace_strsize_default = 256;
dtrace_optval_t	dtrace_cleanrate_default = 9900990;		/* 101 hz */
dtrace_optval_t	dtrace_cleanrate_min = 200000;			/* 5000 hz */
dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	/* 6/minute */
dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_nspec_default = 1;
dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
dtrace_optval_t	dtrace_stackframes_default = 20;
dtrace_optval_t	dtrace_ustackframes_default = 20;
dtrace_optval_t	dtrace_jstackframes_default = 50;
dtrace_optval_t	dtrace_jstackstrsize_default = 512;
int		dtrace_msgdsize_max = 128;
hrtime_t	dtrace_chill_max = MSEC2NSEC(500);		/* 500 ms */
hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
int		dtrace_devdepth_max = 32;
int		dtrace_err_verbose;
hrtime_t	dtrace_deadman_interval = NANOSEC;
hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t	dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax.  One of these,
 * dtrace_zero, is made deliberately so: it is provided as a source of
 * well-known, zero-filled memory.  While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */

/*
 * DTrace Internal Variables
 */
static dev_info_t	*dtrace_devi;		/* device info */
static vmem_t		*dtrace_arena;		/* probe ID arena */
static vmem_t		*dtrace_minor;		/* minor number arena */
static taskq_t		*dtrace_taskq;		/* task queue */
static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
static int		dtrace_nprobes;		/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
static int		dtrace_opens;		/* number of opens */
static int		dtrace_helpers;		/* number of helpers */
static int		dtrace_getf;		/* number of unpriv getf()s */
static void		*dtrace_softstate;	/* softstate pointer */
static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int		dtrace_toxranges;	/* number of toxic ranges */
static int		dtrace_toxranges_max;	/* size of toxic range array */
static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t	*dtrace_panicked;	/* panicking thread */
static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */
static int		dtrace_dynvar_failclean; /* dynvars failed to clean */

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix: mod_lock and cpu_lock.  With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
static kmutex_t		dtrace_lock;		/* probe state lock */
static kmutex_t		dtrace_provider_lock;	/* provider state lock */
static kmutex_t		dtrace_meta_lock;	/* meta-provider state lock */
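
/*
 * As a sketch of the composite ordering implied above (no single call site
 * need acquire all five locks, and cpu_lock and mod_lock are defined
 * elsewhere in the kernel), a caller honoring every constraint would take:
 *
 *	mutex_enter(&dtrace_meta_lock);
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&dtrace_provider_lock);
 *	mutex_enter(&mod_lock);
 *	mutex_enter(&dtrace_lock);
 *
 * ...releasing them in the opposite order.
 */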

/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t	dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_nullop(void)
{}

static int
dtrace_enable_nullop(void)
{
	return (0);
}

static dtrace_pops_t	dtrace_provider_ops = {
	(void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
	(void (*)(void *, struct modctl *))dtrace_nullop,
	(int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
	NULL,
	NULL,
	NULL,
	(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};

static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 *
 * These variables should be set dynamically to enable helper tracing.  The
 * only variables that should be set are dtrace_helptrace_enable (which should
 * be set to a non-zero value to allocate helper tracing buffers on the next
 * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
 * non-zero value to deallocate helper tracing buffers on the next close of
 * /dev/dtrace).  When (and only when) helper tracing is disabled, the
 * buffer size may also be set via dtrace_helptrace_bufsize.
 */
int			dtrace_helptrace_enable = 0;
int			dtrace_helptrace_disable = 0;
int			dtrace_helptrace_bufsize = 16 * 1024 * 1024;
uint32_t		dtrace_helptrace_nlocals;
static dtrace_helptrace_t *dtrace_helptrace_buffer;
static uint32_t		dtrace_helptrace_next = 0;
static int		dtrace_helptrace_wrapped = 0;
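
/*
 * For example, one hedged way to set the enable variable dynamically on a
 * live system (assuming a writable kernel debugger session) is:
 *
 *	# echo 'dtrace_helptrace_enable/W 1' | mdb -kw
 *
 * ...after which the next open of /dev/dtrace allocates the buffers.
 */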

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table.  This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation.  The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#ifdef DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
static const char	*dtrace_errlast;
static kthread_t	*dtrace_errthread;
static kmutex_t		dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation.  There is no real structure to this cpp
 * mishmash -- but is there ever?
 */
#define	DTRACE_HASHSTR(hash, probe)	\
	dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))

#define	DTRACE_HASHNEXT(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)

#define	DTRACE_HASHPREV(hash, probe)	\
	(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)

#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)

#define	DTRACE_AGGHASHSIZE_SLEW		17

#define	DTRACE_V4MAPPED_OFFSET		(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier.  This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables.  To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables.  That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#define	DTRACE_TLS_THRKEY(where) { \
	uint_t intr = 0; \
	uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
	for (; actv; actv >>= 1) \
		intr++; \
	ASSERT(intr < (1 << 3)); \
	(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
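
/*
 * A worked sketch of the key construction: with no interrupt active above
 * LOCK_LEVEL, intr is 0 and the key is just the low 61 bits of
 * (t_did + DIF_VARIABLE_MAX).  If the highest active interrupt is two levels
 * above LOCK_LEVEL, cpu_intr_actv >> (LOCK_LEVEL + 1) is 0b10, the loop
 * shifts it to zero in two iterations, and intr is 2 -- placing 2 in the
 * top three bits of the key while the low 61 bits still hold the offset
 * t_did.
 */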

#define	DT_BSWAP_8(x)	((x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
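
/*
 * For example, DT_BSWAP_16(0x1234) evaluates to 0x3412 and
 * DT_BSWAP_32(0x12345678) to 0x78563412: each level swaps the two halves
 * produced by the level below it.
 */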

#define	DT_MASK_LO 0x00000000FFFFFFFFULL

#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);

#ifndef __x86
#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (size - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}
#else
#define	DTRACE_ALIGNCHECK(addr, size, flags)
#endif

/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz.  We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes.  Ranges of size 0 are allowed.
 */
#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))
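
/*
 * A worked sketch of why this formulation is safe: with baseaddr 0x1000 and
 * basesz 0x100, a testaddr of 0xfff underflows the first subtraction to a
 * huge unsigned value and fails the first test, while a testaddr of 0x10f0
 * with testsz 0x20 passes the first test but fails the second
 * (0x110 > 0x100).  The final test rejects testaddr/testsz pairs that wrap
 * around the end of the address space.
 */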

#define	DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)		\
do {									\
	if ((remp) != NULL) {						\
		*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);	\
	}								\
_NOTE(CONSTCOND) } while (0)


/*
 * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it.  This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range.  Allocations of size zero are allowed.
 */
#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))
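
/*
 * A worked sketch: with a scratch base of 0x1000, a scratch size of 0x100
 * and dtms_scratch_ptr at 0x10c0, the lefthand side evaluates to 0x40, so
 * an alloc_sz of 0x40 fits but 0x41 does not.  Because alloc_sz stands
 * alone on the righthand side, no arithmetic that could overflow is ever
 * performed on it.
 */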

#define	DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval;						\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	*flags |= CPU_DTRACE_NOFAULT;					\
	/*CSTYLED*/							\
	rval = *((volatile uint##bits##_t *)addr);			\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
									\
	return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);		\
}

#ifdef _LP64
#define	dtrace_loadptr	dtrace_load64
#else
#define	dtrace_loadptr	dtrace_load32
#endif
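
/*
 * The generated dtrace_load8() through dtrace_load64() (and the
 * dtrace_loadptr() alias) are the means by which probe context may safely
 * dereference arbitrary addresses; for example:
 *
 *	uintptr_t val = dtrace_loadptr(addr);
 *
 * If addr falls in a toxic range or faults, the load evaluates to 0 and the
 * per-CPU fault flags (e.g. CPU_DTRACE_BADADDR) are set rather than
 * panicking the system.
 */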

#define	DTRACE_DYNHASH_FREE	0
#define	DTRACE_DYNHASH_SINK	1
#define	DTRACE_DYNHASH_VALID	2

#define	DTRACE_MATCH_FAIL	-1
#define	DTRACE_MATCH_NEXT	0
#define	DTRACE_MATCH_DONE	1
#define	DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define	DTRACE_STATE_ALIGN	64

#define	DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

#define	DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)

static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static void dtrace_enabling_reap(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
static int dtrace_priv_proc(dtrace_state_t *, dtrace_mstate_t *);
static void dtrace_getf_barrier(void);
static int dtrace_canload_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);
static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context.  Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.)  If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note: not called from probe context."
 */
void
dtrace_panic(const char *format, ...)
{
	va_list alist;

	va_start(alist, format);
	dtrace_vpanic(format, alist);
	va_end(alist);
}

int
dtrace_assfail(const char *a, const char *f, int l)
{
	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage.  If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors.  (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
/* BEGIN CSTYLED */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
/* END CSTYLED */

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
	if (dest < mstate->dtms_scratch_base)
		return (0);

	if (dest + size < dest)
		return (0);

	if (dest + size > mstate->dtms_scratch_ptr)
		return (0);

	return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
    dtrace_statvar_t **svars, int nsvars)
{
	int i;
	size_t maxglobalsize, maxlocalsize;

	if (nsvars == 0)
		return (0);

	maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
	maxlocalsize = maxglobalsize * NCPU;

	for (i = 0; i < nsvars; i++) {
		dtrace_statvar_t *svar = svars[i];
		uint8_t scope;
		size_t size;

		if (svar == NULL || (size = svar->dtsv_size) == 0)
			continue;

		scope = svar->dtsv_var.dtdv_scope;

		/*
		 * We verify that our size is valid in the spirit of providing
		 * defense in depth: we want to prevent attackers from using
		 * DTrace to escalate an orthogonal kernel heap corruption bug
		 * into the ability to store to arbitrary locations in memory.
		 */
		VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
		    (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));

		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data,
		    svar->dtsv_size)) {
			DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
			    svar->dtsv_size);
			return (1);
		}
	}

	return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 * region.  The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canstore which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size)) {
		DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
		    mstate->dtms_scratch_size);
		return (1);
	}

	/*
	 * Now check to see if it's a dynamic variable.  This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
	if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;
		dtrace_dynvar_t *dvar;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state.  For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 *	(4) Not be in the tuple space of a dynamic variable
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);

		if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
			return (0);

		if (chunkoffs < sizeof (dtrace_dynvar_t) +
		    ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
			return (0);

		DTRACE_RANGE_REMAIN(remain, addr, dvar, dstate->dtds_chunksize);
		return (1);
	}

	/*
	 * Finally, check the static local and global variables.  These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canload which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
	file_t *fp;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen)) {
		DTRACE_RANGE_REMAIN(remain, addr,
		    mstate->dtms_difo->dtdo_strtab,
		    mstate->dtms_difo->dtdo_strlen);
		return (1);
	}

	if (vstate->dtvs_state != NULL &&
	    dtrace_priv_proc(vstate->dtvs_state, mstate)) {
		proc_t *p;

		/*
		 * When we have privileges to the current process, there are
		 * several context-related kernel structures that are safe to
		 * read, even absent the privilege to read from kernel memory.
		 * These reads are safe because these structures contain only
		 * state that (1) we're permitted to read, (2) is harmless or
		 * (3) contains pointers to additional kernel state that we're
		 * not permitted to read (and as such, do not present an
		 * opportunity for privilege escalation).  Finally (and
		 * critically), because of the nature of their relation with
		 * the current thread context, the memory associated with these
		 * structures cannot change over the duration of probe context,
		 * and it is therefore impossible for this memory to be
		 * deallocated and reallocated as something else while it's
		 * being operated upon.
		 */
		if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, curthread,
			    sizeof (kthread_t));
			return (1);
		}

		if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
		    sz, curthread->t_procp, sizeof (proc_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_procp,
			    sizeof (proc_t));
			return (1);
		}

		if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cred, sizeof (cred_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cred,
			    sizeof (cred_t));
			return (1);
		}

		if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
		    &(p->p_pidp->pid_id), sizeof (pid_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, &(p->p_pidp->pid_id),
			    sizeof (pid_t));
			return (1);
		}

		if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
		    curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
			DTRACE_RANGE_REMAIN(remain, addr, curthread->t_cpu,
			    offsetof(cpu_t, cpu_pause_thread));
			return (1);
		}
	}

	if ((fp = mstate->dtms_getf) != NULL) {
		uintptr_t psz = sizeof (void *);
		vnode_t *vp;
		vnodeops_t *op;

		/*
		 * When getf() returns a file_t, the enabling is implicitly
		 * granted the (transient) right to read the returned file_t
		 * as well as the v_path and v_op->vnop_name of the underlying
		 * vnode.  These accesses are allowed after a successful
		 * getf() because the members that they refer to cannot change
		 * once set -- and the barrier logic in the kernel's closef()
		 * path assures that the file_t and its referenced vnode_t
		 * cannot themselves be stale (that is, it is impossible for
		 * either dtms_getf itself or its f_vnode member to reference
		 * freed memory).
		 */
		if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t))) {
			DTRACE_RANGE_REMAIN(remain, addr, fp, sizeof (file_t));
			return (1);
		}

		if ((vp = fp->f_vnode) != NULL) {
			size_t slen;

			if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) {
				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_path,
				    psz);
				return (1);
			}

			slen = strlen(vp->v_path) + 1;
			if (DTRACE_INRANGE(addr, sz, vp->v_path, slen)) {
				DTRACE_RANGE_REMAIN(remain, addr, vp->v_path,
				    slen);
				return (1);
			}

			if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz)) {
				DTRACE_RANGE_REMAIN(remain, addr, &vp->v_op,
				    psz);
				return (1);
			}

			if ((op = vp->v_op) != NULL &&
			    DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
				DTRACE_RANGE_REMAIN(remain, addr,
				    &op->vnop_name, psz);
				return (1);
			}

			if (op != NULL && op->vnop_name != NULL &&
			    DTRACE_INRANGE(addr, sz, op->vnop_name,
			    (slen = strlen(op->vnop_name) + 1))) {
				DTRACE_RANGE_REMAIN(remain, addr,
				    op->vnop_name, slen);
				return (1);
			}
		}
	}

	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t rsize;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * Even if the caller is uninterested in querying the remaining valid
	 * range, it is required to ensure that the access is allowed.
	 */
	if (remain == NULL) {
		remain = &rsize;
	}
	if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
		size_t strsz;
		/*
		 * Perform the strlen after determining the length of the
		 * memory region which is accessible.  This prevents timing
		 * information from being used to find NULs in memory which is
		 * not accessible to the caller.
		 */
		strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
		    MIN(sz, *remain));
		if (strsz <= *remain) {
			return (1);
		}
	}

	return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t sz;
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	/*
	 * Calculate the max size before performing any checks since even
	 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
	 * return the max length via 'remain'.
	 */
	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_state_t *state = vstate->dtvs_state;

		if (state != NULL) {
			sz = state->dts_options[DTRACEOPT_STRSIZE];
		} else {
			/*
			 * In helper context, we have a NULL state; fall back
			 * to using the system-wide default for the string size
			 * in this case.
			 */
			sz = dtrace_strsize_default;
		}
	} else {
		sz = type->dtdt_size;
	}

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
		return (1);
	}

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
		    vstate));
	}
	return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
	    vstate));
}

/*
 * Convert a string to a signed integer using safe loads.
 *
 * NOTE: This function uses various macros from strtolctype.h to manipulate
 * digit values, etc -- these have all been checked to ensure they make
 * no additional function calls.
 */
static int64_t
dtrace_strtoll(char *input, int base, size_t limit)
{
	uintptr_t pos = (uintptr_t)input;
	int64_t val = 0;
	int x;
	boolean_t neg = B_FALSE;
	char c, cc, ccc;
	uintptr_t end = pos + limit;

	/*
	 * Consume any whitespace preceding digits.
	 */
	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
		pos++;

	/*
	 * Handle an explicit sign if one is present.
	 */
	if (c == '-' || c == '+') {
		if (c == '-')
			neg = B_TRUE;
		c = dtrace_load8(++pos);
	}

	/*
	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
	 * if present.
	 */
	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
		pos += 2;
		c = ccc;
	}

	/*
	 * Read in contiguous digits until the first non-digit character.
	 */
	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
	    c = dtrace_load8(++pos))
		val = val * base + x;

	return (neg ? -val : val);
}

/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}

/*
 * Copy src to dst using safe memory accesses.  The src is assumed to be unsafe
 * memory specified by the DIF program.  The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace.  As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered.  The src is assumed to
 * be unsafe memory specified by the DIF program.  The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type.  The src is assumed to be unsafe memory specified by the DIF
 * program.  The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
	} else {
		dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
	}
}

/*
 * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
 * unsafe memory specified by the DIF program.  The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop.  Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}

/*
 * Shift the 128-bit value in a by b.  If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	uint64_t mask;

	if (b == 0)
		return;

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			a[0] >>= b;
			mask = 1LL << (64 - b);
			mask -= 1;
			a[0] |= ((a[1] & mask) << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] <<= b;
			mask = a[0] >> (64 - b);
			a[1] |= mask;
			a[0] <<= b;
		}
	}
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1, hi2, lo1, lo2;
	uint64_t tmp[2];

	hi1 = factor1 >> 32;
	hi2 = factor2 >> 32;

	lo1 = factor1 & DT_MASK_LO;
	lo2 = factor2 & DT_MASK_LO;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	tmp[0] = hi1 * lo2;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);

	tmp[0] = hi2 * lo1;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);
}
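
/*
 * A worked sketch of the decomposition: for factor1 = 0x100000002 and
 * factor2 = 0x300000004 (hi1 = 1, lo1 = 2, hi2 = 3, lo2 = 4), the partial
 * products are lo1 * lo2 = 8, hi1 * hi2 = 3 in the high 64 bits, and cross
 * terms of 4 and 6 each shifted left 32 bits -- summing to the 128-bit
 * product 0x00000003_0000000a_00000008.
 */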

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the user credentials of the process that enabled the
 * invoking ECB match the target credentials.
 */
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	if ((cr = CRED()) != NULL &&
	    s_cr->cr_uid == cr->cr_uid &&
	    s_cr->cr_uid == cr->cr_ruid &&
	    s_cr->cr_uid == cr->cr_suid &&
	    s_cr->cr_gid == cr->cr_gid &&
	    s_cr->cr_gid == cr->cr_rgid &&
	    s_cr->cr_gid == cr->cr_sgid)
		return (1);

	return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials.
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
		return (1);

	return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 */
static int
dtrace_priv_proc_common_nocd()
{
	proc_t *proc;

	if ((proc = ttoproc(curthread)) != NULL &&
	    !(proc->p_flag & SNOCD))
		return (1);

	return (0);
}

static int
dtrace_priv_proc_destructive(dtrace_state_t *state, dtrace_mstate_t *mstate)
{
	int action = state->dts_cred.dcr_action;

	if (!(mstate->dtms_access & DTRACE_ACCESS_PROC))
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
	    dtrace_priv_proc_common_zone(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
	    dtrace_priv_proc_common_user(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
	    dtrace_priv_proc_common_nocd() == 0)
		goto bad;

	return (1);

bad:
	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_proc_control(dtrace_state_t *state, dtrace_mstate_t *mstate)
{
	if (mstate->dtms_access & DTRACE_ACCESS_PROC) {
		if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
			return (1);

		if (dtrace_priv_proc_common_zone(state) &&
		    dtrace_priv_proc_common_user(state) &&
		    dtrace_priv_proc_common_nocd())
			return (1);
	}

	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_proc(dtrace_state_t *state, dtrace_mstate_t *mstate)
{
	if ((mstate->dtms_access & DTRACE_ACCESS_PROC) &&
	    (state->dts_cred.dcr_action & DTRACE_CRA_PROC))
		return (1);

	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_kernel(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
		return (1);

	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}

static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
		return (1);

	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}
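
/*
 * An editorial note on the pattern above:  each dtrace_priv_*() routine
 * returns 1 if the requisite privilege is held.  On failure, it sets
 * CPU_DTRACE_UPRIV (insufficient user-level privilege) or CPU_DTRACE_KPRIV
 * (insufficient kernel-level privilege) in the per-CPU cpuc_dtrace_flags
 * and returns 0, allowing the caller in probe context to skip the
 * offending action.
 */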

/*
 * Determine if the dte_cond of the specified ECB allows for processing of
 * the current probe to continue.  Note that this routine may allow continued
 * processing, but with access(es) stripped from the mstate's dtms_access
 * field.
 */
static int
dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
    dtrace_ecb_t *ecb)
{
	dtrace_probe_t *probe = ecb->dte_probe;
	dtrace_provider_t *prov = probe->dtpr_provider;
	dtrace_pops_t *pops = &prov->dtpv_pops;
	int mode = DTRACE_MODE_NOPRIV_DROP;

	ASSERT(ecb->dte_cond);

	if (pops->dtps_mode != NULL) {
		mode = pops->dtps_mode(prov->dtpv_arg,
		    probe->dtpr_id, probe->dtpr_arg);

		ASSERT(mode & (DTRACE_MODE_USER | DTRACE_MODE_KERNEL));
		ASSERT(mode & (DTRACE_MODE_NOPRIV_RESTRICT |
		    DTRACE_MODE_NOPRIV_DROP));
	}

	/*
	 * If the dte_cond bits indicate that this consumer is only allowed to
	 * see user-mode firings of this probe, check that the probe was fired
	 * while in a user context.  If that's not the case, use the policy
	 * specified by the provider to determine if we drop the probe or
	 * merely restrict operation.
	 */
	if (ecb->dte_cond & DTRACE_COND_USERMODE) {
		ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);

		if (!(mode & DTRACE_MODE_USER)) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

			mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
		}
	}

	/*
	 * This is more subtle than it looks.  We have to be absolutely certain
	 * that CRED() isn't going to change out from under us so it's only
	 * legit to examine that structure if we're in constrained situations.
	 * Currently, the only times we'll do this check are when a
	 * non-super-user has enabled the profile or syscall providers --
	 * providers that allow visibility of all processes.  For the profile
	 * case, the check above will ensure that we're examining a user
	 * context.
	 */
	if (ecb->dte_cond & DTRACE_COND_OWNER) {
		cred_t *cr;
		cred_t *s_cr = state->dts_cred.dcr_cred;
		proc_t *proc;

		ASSERT(s_cr != NULL);

		if ((cr = CRED()) == NULL ||
		    s_cr->cr_uid != cr->cr_uid ||
		    s_cr->cr_uid != cr->cr_ruid ||
		    s_cr->cr_uid != cr->cr_suid ||
		    s_cr->cr_gid != cr->cr_gid ||
		    s_cr->cr_gid != cr->cr_rgid ||
		    s_cr->cr_gid != cr->cr_sgid ||
		    (proc = ttoproc(curthread)) == NULL ||
		    (proc->p_flag & SNOCD)) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

			mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
		}
	}

	/*
	 * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
	 * in our zone, check to see if our mode policy is to restrict rather
	 * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
	 * and DTRACE_ACCESS_ARGS.
	 */
	if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
		cred_t *cr;
		cred_t *s_cr = state->dts_cred.dcr_cred;

		ASSERT(s_cr != NULL);

		if ((cr = CRED()) == NULL ||
		    s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
			if (mode & DTRACE_MODE_NOPRIV_DROP)
				return (0);

			mstate->dtms_access &=
			    ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
		}
	}

	/*
	 * By merits of being in this code path at all, we have limited
	 * privileges.  If the provider has indicated that limited privileges
	 * are to denote restricted operation, strip off the ability to access
	 * arguments.
	 */
	if (mode & DTRACE_MODE_LIMITEDPRIV_RESTRICT)
		mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;

	return (1);
}
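
/*
 * An editorial summary of the dte_cond handling above (a sketch, not
 * authoritative):  each failed check either drops the firing entirely
 * (DTRACE_MODE_NOPRIV_DROP) or strips access bits and continues
 * (DTRACE_MODE_NOPRIV_RESTRICT):
 *
 *	DTRACE_COND_USERMODE	kernel-mode firing	-> drop or strip ARGS
 *	DTRACE_COND_OWNER	cred mismatch or SNOCD	-> drop or strip PROC
 *	DTRACE_COND_ZONEOWNER	zone mismatch		-> drop or strip
 *							   PROC and ARGS
 */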

/*
 * Note:  not called from probe context.  This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 */
void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
	dtrace_dynvar_t *dirty;
	dtrace_dstate_percpu_t *dcpu;
	dtrace_dynvar_t **rinsep;
	int i, j, work = 0;

	for (i = 0; i < NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];
		rinsep = &dcpu->dtdsc_rinsing;

		/*
		 * If the dirty list is NULL, there is no dirty work to do.
		 */
		if (dcpu->dtdsc_dirty == NULL)
			continue;

		if (dcpu->dtdsc_rinsing != NULL) {
			/*
			 * If the rinsing list is non-NULL, then it is because
			 * this CPU was selected to accept another CPU's
			 * dirty list -- and since that time, dirty buffers
			 * have accumulated.  This is a highly unlikely
			 * condition, but we choose to ignore the dirty
			 * buffers -- they'll be picked up in a future cleanse.
			 */
			continue;
		}

		if (dcpu->dtdsc_clean != NULL) {
			/*
			 * If the clean list is non-NULL, then we're in a
			 * situation where a CPU has done deallocations (we
			 * have a non-NULL dirty list) but no allocations (we
			 * also have a non-NULL clean list).  We can't simply
			 * move the dirty list into the clean list on this
			 * CPU, yet we also don't want to allow this condition
			 * to persist, lest a short clean list prevent a
			 * massive dirty list from being cleaned (which in
			 * turn could lead to otherwise avoidable dynamic
			 * drops).  To deal with this, we look for some CPU
			 * with a NULL clean list, NULL dirty list, and NULL
			 * rinsing list -- and then we borrow this CPU to
			 * rinse our dirty list.
			 */
			for (j = 0; j < NCPU; j++) {
				dtrace_dstate_percpu_t *rinser;

				rinser = &dstate->dtds_percpu[j];

				if (rinser->dtdsc_rinsing != NULL)
					continue;

				if (rinser->dtdsc_dirty != NULL)
					continue;

				if (rinser->dtdsc_clean != NULL)
					continue;

				rinsep = &rinser->dtdsc_rinsing;
				break;
			}

			if (j == NCPU) {
				/*
				 * We were unable to find another CPU that
				 * could accept this dirty list -- we are
				 * therefore unable to clean it now.
				 */
				dtrace_dynvar_failclean++;
				continue;
			}
		}

		work = 1;

		/*
		 * Atomically move the dirty list aside.
		 */
		do {
			dirty = dcpu->dtdsc_dirty;

			/*
			 * Before we zap the dirty list, set the rinsing list.
			 * (This allows for a potential assertion in
			 * dtrace_dynvar():  if a free dynamic variable appears
			 * on a hash chain, either the dirty list or the
			 * rinsing list for some CPU must be non-NULL.)
			 */
			*rinsep = dirty;
			dtrace_membar_producer();
		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
		    dirty, NULL) != dirty);
	}

	if (!work) {
		/*
		 * We have no work to do; we can simply return.
		 */
		return;
	}

	dtrace_sync();

	for (i = 0; i < NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		if (dcpu->dtdsc_rinsing == NULL)
			continue;

		/*
		 * We are now guaranteed that no hash chain contains a pointer
		 * into this dirty list; we can make it clean.
		 */
		ASSERT(dcpu->dtdsc_clean == NULL);
		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
		dcpu->dtdsc_rinsing = NULL;
	}

	/*
	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
	 * This prevents a race whereby a CPU incorrectly decides that
	 * the state should be something other than DTRACE_DSTATE_CLEAN
	 * after dtrace_dynvar_clean() has completed.
	 */
	dtrace_sync();

	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}
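
/*
 * An editorial sketch of the lifecycle implied by the code above and by
 * dtrace_dynvar() below (not authoritative):  a freed chunk moves
 *
 *	dirty --(dtrace_dynvar_clean() + dtrace_sync())--> rinsing
 *	rinsing --(second dtrace_sync())--> clean
 *	clean --(dtrace_dynvar() allocation path)--> free
 *
 * with each dtrace_sync() guaranteeing that no probe-context CPU still
 * holds a pointer into a list before that list is promoted.
 */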

/*
 * Depending on the value of the op parameter, this function looks up,
 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
 * allocation is requested, this function will return a pointer to a
 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 * variable can be allocated.  If NULL is returned, the appropriate counter
 * will be incremented.
 */
dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	uint64_t hashval = DTRACE_DYNHASH_VALID;
	dtrace_dynhash_t *hash = dstate->dtds_hash;
	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
	processorid_t me = CPU->cpu_id, cpu = me;
	dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
	size_t bucket, ksize;
	size_t chunksize = dstate->dtds_chunksize;
	uintptr_t kdata, lock, nstate;
	uint_t i;

	ASSERT(nkeys != 0);

	/*
	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
	 * algorithm.  For the by-value portions, we perform the algorithm in
	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
	 * bit, and seems to have only a minute effect on distribution.  For
	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
	 * over each referenced byte.  It's painful to do this, but it's much
	 * better than pathological hash distribution.  The efficacy of the
	 * hashing algorithm (and a comparison with other algorithms) may be
	 * found by running the ::dtrace_dynstat MDB dcmd.
	 */
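	/*
	 * For reference, a minimal user-level sketch of the canonical
	 * 8-bit "One-at-a-time" hash that the loop below adapts ("data"
	 * and "len" are hypothetical, for illustration only):
	 *
	 *	uint32_t h = 0;
	 *	for (size_t n = 0; n < len; n++) {
	 *		h += data[n];
	 *		h += (h << 10);
	 *		h ^= (h >> 6);
	 *	}
	 *	h += (h << 3);
	 *	h ^= (h >> 11);
	 *	h += (h << 15);
	 */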
	for (i = 0; i < nkeys; i++) {
		if (key[i].dttk_size == 0) {
			uint64_t val = key[i].dttk_value;

			hashval += (val >> 48) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += (val >> 32) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += (val >> 16) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += val & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);
		} else {
			/*
			 * This is incredibly painful, but it beats the hell
			 * out of the alternative.
			 */
			uint64_t j, size = key[i].dttk_size;
			uintptr_t base = (uintptr_t)key[i].dttk_value;

			if (!dtrace_canload(base, size, mstate, vstate))
				break;

			for (j = 0; j < size; j++) {
				hashval += dtrace_load8(base + j);
				hashval += (hashval << 10);
				hashval ^= (hashval >> 6);
			}
		}
	}

	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
		return (NULL);

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	/*
	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
	 * comes out to be one of our two sentinel hash values.  If this
	 * actually happens, we set the hashval to be a value known to be a
	 * non-sentinel value.
	 */
	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
		hashval = DTRACE_DYNHASH_VALID;

	/*
	 * Yes, it's painful to do a divide here.  If the cycle count becomes
	 * important here, tricks can be pulled to reduce it.  (However, it's
	 * critical that hash collisions be kept to an absolute minimum;
	 * they're much more painful than a divide.)  It's better to have a
	 * solution that generates few collisions and still keeps things
	 * relatively simple.
	 */
	bucket = hashval % dstate->dtds_hashsize;

	if (op == DTRACE_DYNVAR_DEALLOC) {
		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;

		for (;;) {
			while ((lock = *lockp) & 1)
				continue;

			if (dtrace_casptr((void *)lockp,
			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
				break;
		}

		dtrace_membar_producer();
	}

top:
	prev = NULL;
	lock = hash[bucket].dtdh_lock;

	dtrace_membar_consumer();

	start = hash[bucket].dtdh_chain;
	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
	    op != DTRACE_DYNVAR_DEALLOC));

	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
		dtrace_key_t *dkey = &dtuple->dtt_key[0];

		if (dvar->dtdv_hashval != hashval) {
			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
				/*
				 * We've reached the sink, and therefore the
				 * end of the hash chain; we can kick out of
				 * the loop knowing that we have seen a valid
				 * snapshot of state.
				 */
				ASSERT(dvar->dtdv_next == NULL);
				ASSERT(dvar == &dtrace_dynhash_sink);
				break;
			}

			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
				/*
				 * We've gone off the rails:  somewhere along
				 * the line, one of the members of this hash
				 * chain was deleted.  Note that we could also
				 * detect this by simply letting this loop run
				 * to completion, as we would eventually hit
				 * the end of the dirty list.  However, we
				 * want to avoid running the length of the
				 * dirty list unnecessarily (it might be quite
				 * long), so we catch this as early as
				 * possible by detecting the hash marker.  In
				 * this case, we simply set dvar to NULL and
				 * break; the conditional after the loop will
				 * send us back to top.
				 */
				dvar = NULL;
				break;
			}

			goto next;
		}

		if (dtuple->dtt_nkeys != nkeys)
			goto next;

		for (i = 0; i < nkeys; i++, dkey++) {
			if (dkey->dttk_size != key[i].dttk_size)
				goto next;	/* size or type mismatch */

			if (dkey->dttk_size != 0) {
				if (dtrace_bcmp(
				    (void *)(uintptr_t)key[i].dttk_value,
				    (void *)(uintptr_t)dkey->dttk_value,
				    dkey->dttk_size))
					goto next;
			} else {
				if (dkey->dttk_value != key[i].dttk_value)
					goto next;
			}
		}

		if (op != DTRACE_DYNVAR_DEALLOC)
			return (dvar);

		ASSERT(dvar->dtdv_next == NULL ||
		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);

		if (prev != NULL) {
			ASSERT(hash[bucket].dtdh_chain != dvar);
			ASSERT(start != dvar);
			ASSERT(prev->dtdv_next == dvar);
			prev->dtdv_next = dvar->dtdv_next;
		} else {
			if (dtrace_casptr(&hash[bucket].dtdh_chain,
			    start, dvar->dtdv_next) != start) {
				/*
				 * We have failed to atomically swing the
				 * hash table head pointer, presumably because
				 * of a conflicting allocation on another CPU.
				 * We need to reread the hash chain and try
				 * again.
				 */
				goto top;
			}
		}

		dtrace_membar_producer();

		/*
		 * Now set the hash value to indicate that it's free.
		 */
		ASSERT(hash[bucket].dtdh_chain != dvar);
		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

		dtrace_membar_producer();

		/*
		 * Set the next pointer to point at the dirty list, and
		 * atomically swing the dirty pointer to the newly freed dvar.
		 */
		do {
			next = dcpu->dtdsc_dirty;
			dvar->dtdv_next = next;
		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);

		/*
		 * Finally, unlock this hash bucket.
		 */
		ASSERT(hash[bucket].dtdh_lock == lock);
		ASSERT(lock & 1);
		hash[bucket].dtdh_lock++;

		return (NULL);
next:
		prev = dvar;
		continue;
	}

	if (dvar == NULL) {
		/*
		 * If dvar is NULL, it is because we went off the rails:
		 * one of the elements that we traversed in the hash chain
		 * was deleted while we were traversing it.  In this case,
		 * we assert that we aren't doing a dealloc (deallocs lock
		 * the hash bucket to prevent themselves from racing with
		 * one another), and retry the hash chain traversal.
		 */
		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
		goto top;
	}

	if (op != DTRACE_DYNVAR_ALLOC) {
		/*
		 * If we are not to allocate a new variable, we want to
		 * return NULL now.  Before we return, check that the value
		 * of the lock word hasn't changed.  If it has, we may have
		 * seen an inconsistent snapshot.
		 */
		if (op == DTRACE_DYNVAR_NOALLOC) {
			if (hash[bucket].dtdh_lock != lock)
				goto top;
		} else {
			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
			ASSERT(hash[bucket].dtdh_lock == lock);
			ASSERT(lock & 1);
			hash[bucket].dtdh_lock++;
		}

		return (NULL);
	}

	/*
	 * We need to allocate a new dynamic variable.  The size we need is the
	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
	 * the size of any referred-to data (dsize).  We then round the final
	 * size up to the chunksize for allocation.
	 */
	for (ksize = 0, i = 0; i < nkeys; i++)
		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));

	/*
	 * This should be pretty much impossible, but could happen if, say,
	 * strange DIF specified the tuple.  Ideally, this should be an
	 * assertion and not an error condition -- but that requires that the
	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
	 * bullet-proof.  (That is, it must not be able to be fooled by
	 * malicious DIF.)  Given the lack of backwards branches in DIF,
	 * solving this would presumably not amount to solving the Halting
	 * Problem -- but it still seems awfully hard.
	 */
	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
	    ksize + dsize > chunksize) {
		dcpu->dtdsc_drops++;
		return (NULL);
	}

	nstate = DTRACE_DSTATE_EMPTY;

	do {
retry:
		free = dcpu->dtdsc_free;

		if (free == NULL) {
			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
			void *rval;

			if (clean == NULL) {
				/*
				 * We're out of dynamic variable space on
				 * this CPU.  Unless we have tried all CPUs,
				 * we'll try to allocate from a different
				 * CPU.
				 */
				switch (dstate->dtds_state) {
				case DTRACE_DSTATE_CLEAN: {
					void *sp = &dstate->dtds_state;

					if (++cpu >= NCPU)
						cpu = 0;

					if (dcpu->dtdsc_dirty != NULL &&
					    nstate == DTRACE_DSTATE_EMPTY)
						nstate = DTRACE_DSTATE_DIRTY;

					if (dcpu->dtdsc_rinsing != NULL)
						nstate = DTRACE_DSTATE_RINSING;

					dcpu = &dstate->dtds_percpu[cpu];

					if (cpu != me)
						goto retry;

					(void) dtrace_cas32(sp,
					    DTRACE_DSTATE_CLEAN, nstate);

					/*
					 * To increment the correct bean
					 * counter, take another lap.
					 */
					goto retry;
				}

				case DTRACE_DSTATE_DIRTY:
					dcpu->dtdsc_dirty_drops++;
					break;

				case DTRACE_DSTATE_RINSING:
					dcpu->dtdsc_rinsing_drops++;
					break;

				case DTRACE_DSTATE_EMPTY:
					dcpu->dtdsc_drops++;
					break;
				}

				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
				return (NULL);
			}

			/*
			 * The clean list appears to be non-empty.  We want to
			 * move the clean list to the free list; we start by
			 * moving the clean pointer aside.
			 */
			if (dtrace_casptr(&dcpu->dtdsc_clean,
			    clean, NULL) != clean) {
				/*
				 * We are in one of two situations:
				 *
				 *  (a)	The clean list was switched to the
				 *	free list by another CPU.
				 *
				 *  (b)	The clean list was added to by the
				 *	cleansing cyclic.
				 *
				 * In either of these situations, we can
				 * just reattempt the free list allocation.
				 */
				goto retry;
			}

			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);

			/*
			 * Now we'll move the clean list to our free list.
			 * It's impossible for this to fail:  the only way
			 * the free list can be updated is through this
			 * code path, and only one CPU can own the clean list.
			 * Thus, it would only be possible for this to fail if
			 * this code were racing with dtrace_dynvar_clean().
			 * (That is, if dtrace_dynvar_clean() updated the clean
			 * list, and we ended up racing to update the free
			 * list.)  This race is prevented by the dtrace_sync()
			 * in dtrace_dynvar_clean() -- which flushes the
			 * owners of the clean lists out before resetting
			 * the clean lists.
			 */
			dcpu = &dstate->dtds_percpu[me];
			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
			ASSERT(rval == NULL);
			goto retry;
		}

		dvar = free;
		new_free = dvar->dtdv_next;
	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);

	/*
	 * We have now allocated a new chunk.  We copy the tuple keys into the
	 * tuple array and copy any referenced key data into the data space
	 * following the tuple array.  As we do this, we relocate dttk_value
	 * in the final tuple to point to the key data address in the chunk.
	 */
	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
	dvar->dtdv_data = (void *)(kdata + ksize);
	dvar->dtdv_tuple.dtt_nkeys = nkeys;

	for (i = 0; i < nkeys; i++) {
		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
		size_t kesize = key[i].dttk_size;

		if (kesize != 0) {
			dtrace_bcopy(
			    (const void *)(uintptr_t)key[i].dttk_value,
			    (void *)kdata, kesize);
			dkey->dttk_value = kdata;
			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
		} else {
			dkey->dttk_value = key[i].dttk_value;
		}

		dkey->dttk_size = kesize;
	}

	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
	dvar->dtdv_hashval = hashval;
	dvar->dtdv_next = start;

	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
		return (dvar);

	/*
	 * The cas has failed.  Either another CPU is adding an element to
	 * this hash chain, or another CPU is deleting an element from this
	 * hash chain.  The simplest way to deal with both of these cases
	 * (though not necessarily the most efficient) is to free our
	 * allocated block and re-attempt it all.  Note that the free is
	 * to the dirty list and _not_ to the free list.  This is to prevent
	 * races with allocators, above.
	 */
	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

	dtrace_membar_producer();

	do {
		free = dcpu->dtdsc_dirty;
		dvar->dtdv_next = free;
	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);

	goto top;
}
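
/*
 * An editorial note on the op modes (a sketch based on the code above):
 * DTRACE_DYNVAR_NOALLOC is a pure lookup, as used for loads of dynamic
 * variables; DTRACE_DYNVAR_ALLOC returns an existing variable for the
 * tuple or allocates a fresh chunk for it; DTRACE_DYNVAR_DEALLOC unhooks
 * the variable from its hash chain and returns it to the dirty list, as
 * when zero is assigned to a dynamic variable.
 */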

/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
	if ((int64_t)nval < (int64_t)*oval)
		*oval = nval;
}

/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
	if ((int64_t)nval > (int64_t)*oval)
		*oval = nval;
}

static void
dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
{
	int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
	int64_t val = (int64_t)nval;

	if (val < 0) {
		for (i = 0; i < zero; i++) {
			if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
				quanta[i] += incr;
				return;
			}
		}
	} else {
		for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
			if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
				quanta[i - 1] += incr;
				return;
			}
		}

		quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
		return;
	}

	ASSERT(0);
}
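
/*
 * An editorial worked example, assuming the conventional power-of-two
 * bucket boundaries ..., -2, -1, 0, 1, 2, 4, 8, ...:  a positive value
 * of 7 fails the "val < DTRACE_QUANTIZE_BUCKETVAL(i)" test at 1, 2 and
 * 4, and first passes it at 8 -- so the increment lands in quanta[i - 1],
 * the bucket labelled 4, which counts values in the range [4, 8).
 */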

static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *lquanta++;
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	int32_t val = (int32_t)nval, level;

	ASSERT(step != 0);
	ASSERT(levels != 0);

	if (val < base) {
		/*
		 * This is an underflow.
		 */
		lquanta[0] += incr;
		return;
	}

	level = (val - base) / step;

	if (level < levels) {
		lquanta[level + 1] += incr;
		return;
	}

	/*
	 * This is an overflow.
	 */
	lquanta[levels + 1] += incr;
}

static int
dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
    uint16_t high, uint16_t nsteps, int64_t value)
{
	int64_t this = 1, last, next;
	int base = 1, order;

	ASSERT(factor <= nsteps);
	ASSERT(nsteps % factor == 0);

	for (order = 0; order < low; order++)
		this *= factor;

	/*
	 * If our value is less than our factor taken to the power of the
	 * low order of magnitude, it goes into the zeroth bucket.
	 */
	if (value < (last = this))
		return (0);

	for (this *= factor; order <= high; order++) {
		int nbuckets = this > nsteps ? nsteps : this;

		if ((next = this * factor) < this) {
			/*
			 * We should not generally get log/linear quantizations
			 * with a high magnitude that allows 64-bits to
			 * overflow, but we nonetheless protect against this
			 * by explicitly checking for overflow, and clamping
			 * our value accordingly.
			 */
			value = this - 1;
		}

		if (value < this) {
			/*
			 * If our value lies within this order of magnitude,
			 * determine its position by taking the offset within
			 * the order of magnitude, dividing by the bucket
			 * width, and adding to our (accumulated) base.
			 */
			return (base + (value - last) / (this / nbuckets));
		}

		base += nbuckets - (nbuckets / factor);
		last = this;
		this = next;
	}

	/*
	 * Our value is greater than or equal to our factor taken to the
	 * power of one plus the high magnitude -- return the top bucket.
	 */
	return (base);
}
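
/*
 * An editorial worked example:  with factor = 10, low = 0, high = 2 and
 * nsteps = 10, a value of 42 falls in the order of magnitude [10, 100),
 * which is divided into 10 buckets of width 10.  The base accumulated
 * while skipping past [1, 10) is 10, so the function returns
 * 10 + (42 - 10) / 10 = 13 -- the bucket covering [40, 50).
 */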

static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *llquanta++;
	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);

	llquanta[dtrace_aggregate_llquantize_bucket(factor,
	    low, high, nsteps, nval)] += incr;
}

/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
	data[0]++;
	data[1] += nval;
}
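
/*
 * An editorial note:  avg() is maintained as a (count, sum) pair; the
 * division itself is deferred to the consumer, which reports
 * data[1] / data[0].  Keeping the raw pair keeps this aggregator
 * trivially mergeable across CPUs and buffers.
 */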

/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
	int64_t snval = (int64_t)nval;
	uint64_t tmp[2];

	data[0]++;
	data[1] += nval;

	/*
	 * What we want to say here is:
	 *
	 * data[2] += nval * nval;
	 *
	 * But given that nval is 64-bit, we could easily overflow, so
	 * we do this as 128-bit arithmetic.
	 */
	if (snval < 0)
		snval = -snval;

	dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
	dtrace_add_128(data + 2, tmp, data + 2);
}
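
/*
 * An editorial note:  data[] thus holds the count, the sum, and a
 * 128-bit sum of squares.  A consumer can recover the standard
 * deviation from these running totals as
 *
 *	sqrt(sum-of-squares / count - (sum / count)^2)
 *
 * i.e. sqrt(E[X^2] - E[X]^2), without any division or square root
 * ever being performed in probe context.
 */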

/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
	*oval = *oval + 1;
}

/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
	*oval += nval;
}

/*
 * Aggregate given the tuple in the principal data buffer, and the aggregating
 * action denoted by the specified dtrace_aggregation_t.  The aggregation
 * buffer is specified as the buf parameter.  This routine does not return
 * failure; if there is no space in the aggregation buffer, the data will be
 * dropped, and a corresponding counter incremented.
 */
static void
dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
    intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
{
	dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
	uint32_t i, ndx, size, fsize;
	uint32_t align =