/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/sysmacros.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/conf.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/errno.h>
#include <sys/vtrace.h>
#include <sys/ftrace.h>
#include <sys/ontrap.h>
#include <sys/multidata.h>
#include <sys/multidata_impl.h>
#include <sys/sdt.h>
#include <sys/strft.h>

#ifdef DEBUG
#include <sys/kmem_impl.h>
#endif

/*
 * This file contains all the STREAMS utility routines that may
 * be used by modules and drivers.
 */

/*
 * STREAMS message allocator: principles of operation
 *
 * The streams message allocator consists of all the routines that
 * allocate, dup and free streams messages: allocb(), [d]esballoc[a],
 * dupb(), freeb() and freemsg().  What follows is a high-level view
 * of how the allocator works.
 *
 * Every streams message consists of one or more mblks, a dblk, and data.
 * All mblks for all types of messages come from a common mblk_cache.
 * The dblk and data come in several flavors, depending on how the
 * message is allocated:
 *
 * (1) mblks up to DBLK_MAX_CACHE size are allocated from a collection of
 *     fixed-size dblk/data caches.  For message sizes that are multiples of
 *     PAGESIZE, dblks are allocated separately from the buffer.
 *     The associated buffer is allocated by the constructor using kmem_alloc().
 *     For all other message sizes, dblk and its associated data is allocated
 *     as a single contiguous chunk of memory.
 *     Objects in these caches consist of a dblk plus its associated data.
 *     allocb() determines the nearest-size cache by table lookup:
 *     the dblk_cache[] array provides the mapping from size to dblk cache.
 *
 * (2) Large messages (size > DBLK_MAX_CACHE) are constructed by
 *     kmem_alloc()'ing a buffer for the data and supplying that
 *     buffer to gesballoc(), described below.
 *
 * (3) The four flavors of [d]esballoc[a] are all implemented by a
 *     common routine, gesballoc() ("generic esballoc").  gesballoc()
 *     allocates a dblk from the global dblk_esb_cache and sets db_base,
 *     db_lim and db_frtnp to describe the caller-supplied buffer.
 *
 * While there are several routines to allocate messages, there is only
 * one routine to free messages: freeb().  freeb() simply invokes the
 * dblk's free method, dbp->db_free(), which is set at allocation time.
 *
 * dupb() creates a new reference to a message by allocating a new mblk,
 * incrementing the dblk reference count and setting the dblk's free
 * method to dblk_decref().  The dblk's original free method is retained
 * in db_lastfree.  dblk_decref() decrements the reference count on each
 * freeb().  If this is not the last reference it just frees the mblk;
 * if this *is* the last reference, it restores db_free to db_lastfree,
 * sets db_mblk to the current mblk (see below), and invokes db_lastfree.
 *
 * The implementation makes aggressive use of kmem object caching for
 * maximum performance.  This makes the code simple and compact, but
 * also a bit abstruse in some places.  The invariants that constitute a
 * message's constructed state, described below, are more subtle than usual.
 *
 * Every dblk has an "attached mblk" as part of its constructed state.
 * The mblk is allocated by the dblk's constructor and remains attached
 * until the message is either dup'ed or pulled up.  In the dupb() case
 * the mblk association doesn't matter until the last free, at which time
 * dblk_decref() attaches the last mblk to the dblk.  pullupmsg() affects
 * the mblk association because it swaps the leading mblks of two messages,
 * so it is responsible for swapping their db_mblk pointers accordingly.
 * From a constructed-state viewpoint it doesn't matter that a dblk's
 * attached mblk can change while the message is allocated; all that
 * matters is that the dblk has *some* attached mblk when it's freed.
 *
 * The sizes of the allocb() small-message caches are not magical.
 * They represent a good trade-off between internal and external
 * fragmentation for current workloads.  They should be reevaluated
 * periodically, especially if allocations larger than DBLK_MAX_CACHE
 * become common.  We use 64-byte alignment so that dblks don't
 * straddle cache lines unnecessarily.
 */
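/*
 * A minimal usage sketch of the allocator interfaces described above
 * (src and len are hypothetical caller-supplied values; all of the calls
 * are the routines implemented in this file):
 *
 *	mblk_t *mp, *dmp;
 *
 *	if ((mp = allocb(len, BPRI_MED)) != NULL) {
 *		bcopy(src, mp->b_wptr, len);
 *		mp->b_wptr += len;
 *		dmp = dupb(mp);		(second mblk, same dblk and data)
 *		freemsg(dmp);		(drops the extra dblk reference)
 *		freemsg(mp);		(last reference frees the dblk)
 *	}
 */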
127#define DBLK_MAX_CACHE 73728
128#define DBLK_CACHE_ALIGN 64
129#define DBLK_MIN_SIZE 8
130#define DBLK_SIZE_SHIFT 3
131
132#ifdef _BIG_ENDIAN
133#define DBLK_RTFU_SHIFT(field) \
134 (8 * (&((dblk_t *)0)->db_struioflag - &((dblk_t *)0)->field))
135#else
136#define DBLK_RTFU_SHIFT(field) \
137 (8 * (&((dblk_t *)0)->field - &((dblk_t *)0)->db_ref))
138#endif
139
140#define DBLK_RTFU(ref, type, flags, uioflag) \
141 (((ref) << DBLK_RTFU_SHIFT(db_ref)) | \
142 ((type) << DBLK_RTFU_SHIFT(db_type)) | \
143 (((flags) | (ref - 1)) << DBLK_RTFU_SHIFT(db_flags)) | \
144 ((uioflag) << DBLK_RTFU_SHIFT(db_struioflag)))
145#define DBLK_RTFU_REF_MASK (DBLK_REFMAX << DBLK_RTFU_SHIFT(db_ref))
146#define DBLK_RTFU_WORD(dbp) (*((uint32_t *)&(dbp)->db_ref))
147#define MBLK_BAND_FLAG_WORD(mp) (*((uint32_t *)&(mp)->b_band))
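
/*
 * Illustrative note: DBLK_RTFU() packs db_ref, db_type, db_flags and
 * db_struioflag into the single 32-bit word that DBLK_RTFU_WORD()
 * overlays, so a freshly allocated data block can be initialized with
 * one store, as allocb() below does:
 *
 *	DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
 *
 * This sets db_ref = 1 and db_type = M_DATA and clears the flag bytes.
 */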

static size_t dblk_sizes[] = {
#ifdef _LP64
	16, 80, 144, 208, 272, 336, 528, 1040, 1488, 1936, 2576, 3856,
	8192, 12048, 16384, 20240, 24576, 28432, 32768, 36624,
	40960, 44816, 49152, 53008, 57344, 61200, 65536, 69392,
#else
	64, 128, 320, 576, 1088, 1536, 1984, 2624, 3904,
	8192, 12096, 16384, 20288, 24576, 28480, 32768, 36672,
	40960, 44864, 49152, 53056, 57344, 61248, 65536, 69440,
#endif
	DBLK_MAX_CACHE, 0
};

static struct kmem_cache *dblk_cache[DBLK_MAX_CACHE / DBLK_MIN_SIZE];
static struct kmem_cache *mblk_cache;
static struct kmem_cache *dblk_esb_cache;
static struct kmem_cache *fthdr_cache;
static struct kmem_cache *ftblk_cache;

static void dblk_lastfree(mblk_t *mp, dblk_t *dbp);
static mblk_t *allocb_oversize(size_t size, int flags);
static int allocb_tryhard_fails;
static void frnop_func(void *arg);
frtn_t frnop = { frnop_func };
static void bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp);

static boolean_t rwnext_enter(queue_t *qp);
static void rwnext_exit(queue_t *qp);

/*
 * Patchable mblk/dblk kmem_cache flags.
 */
int dblk_kmem_flags = 0;
int mblk_kmem_flags = 0;

static int
dblk_constructor(void *buf, void *cdrarg, int kmflags)
{
	dblk_t *dbp = buf;
	ssize_t msg_size = (ssize_t)cdrarg;
	size_t index;

	ASSERT(msg_size != 0);

	index = (msg_size - 1) >> DBLK_SIZE_SHIFT;

	ASSERT(index < (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT));

	if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
		return (-1);
	if ((msg_size & PAGEOFFSET) == 0) {
		dbp->db_base = kmem_alloc(msg_size, kmflags);
		if (dbp->db_base == NULL) {
			kmem_cache_free(mblk_cache, dbp->db_mblk);
			return (-1);
		}
	} else {
		dbp->db_base = (unsigned char *)&dbp[1];
	}

	dbp->db_mblk->b_datap = dbp;
	dbp->db_cache = dblk_cache[index];
	dbp->db_lim = dbp->db_base + msg_size;
	dbp->db_free = dbp->db_lastfree = dblk_lastfree;
	dbp->db_frtnp = NULL;
	dbp->db_fthdr = NULL;
	dbp->db_credp = NULL;
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;
	return (0);
}

/*ARGSUSED*/
static int
dblk_esb_constructor(void *buf, void *cdrarg, int kmflags)
{
	dblk_t *dbp = buf;

	if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
		return (-1);
	dbp->db_mblk->b_datap = dbp;
	dbp->db_cache = dblk_esb_cache;
	dbp->db_fthdr = NULL;
	dbp->db_credp = NULL;
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;
	return (0);
}

static int
bcache_dblk_constructor(void *buf, void *cdrarg, int kmflags)
{
	dblk_t *dbp = buf;
	bcache_t *bcp = cdrarg;

	if ((dbp->db_mblk = kmem_cache_alloc(mblk_cache, kmflags)) == NULL)
		return (-1);

	dbp->db_base = kmem_cache_alloc(bcp->buffer_cache, kmflags);
	if (dbp->db_base == NULL) {
		kmem_cache_free(mblk_cache, dbp->db_mblk);
		return (-1);
	}

	dbp->db_mblk->b_datap = dbp;
	dbp->db_cache = (void *)bcp;
	dbp->db_lim = dbp->db_base + bcp->size;
	dbp->db_free = dbp->db_lastfree = bcache_dblk_lastfree;
	dbp->db_frtnp = NULL;
	dbp->db_fthdr = NULL;
	dbp->db_credp = NULL;
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;
	return (0);
}

/*ARGSUSED*/
static void
dblk_destructor(void *buf, void *cdrarg)
{
	dblk_t *dbp = buf;
	ssize_t msg_size = (ssize_t)cdrarg;

	ASSERT(dbp->db_mblk->b_datap == dbp);
	ASSERT(msg_size != 0);
	ASSERT(dbp->db_struioflag == 0);
	ASSERT(dbp->db_struioun.cksum.flags == 0);

	if ((msg_size & PAGEOFFSET) == 0) {
		kmem_free(dbp->db_base, msg_size);
	}

	kmem_cache_free(mblk_cache, dbp->db_mblk);
}

static void
bcache_dblk_destructor(void *buf, void *cdrarg)
{
	dblk_t *dbp = buf;
	bcache_t *bcp = cdrarg;

	kmem_cache_free(bcp->buffer_cache, dbp->db_base);

	ASSERT(dbp->db_mblk->b_datap == dbp);
	ASSERT(dbp->db_struioflag == 0);
	ASSERT(dbp->db_struioun.cksum.flags == 0);

	kmem_cache_free(mblk_cache, dbp->db_mblk);
}

/* ARGSUSED */
static int
ftblk_constructor(void *buf, void *cdrarg, int kmflags)
{
	ftblk_t *fbp = buf;
	int i;

	bzero(fbp, sizeof (ftblk_t));
	if (str_ftstack != 0) {
		for (i = 0; i < FTBLK_EVNTS; i++)
			fbp->ev[i].stk = kmem_alloc(sizeof (ftstk_t), kmflags);
	}

	return (0);
}

/* ARGSUSED */
static void
ftblk_destructor(void *buf, void *cdrarg)
{
	ftblk_t *fbp = buf;
	int i;

	if (str_ftstack != 0) {
		for (i = 0; i < FTBLK_EVNTS; i++) {
			if (fbp->ev[i].stk != NULL) {
				kmem_free(fbp->ev[i].stk, sizeof (ftstk_t));
				fbp->ev[i].stk = NULL;
			}
		}
	}
}

static int
fthdr_constructor(void *buf, void *cdrarg, int kmflags)
{
	fthdr_t *fhp = buf;

	return (ftblk_constructor(&fhp->first, cdrarg, kmflags));
}

static void
fthdr_destructor(void *buf, void *cdrarg)
{
	fthdr_t *fhp = buf;

	ftblk_destructor(&fhp->first, cdrarg);
}

void
streams_msg_init(void)
{
	char name[40];
	size_t size;
	size_t lastsize = DBLK_MIN_SIZE;
	size_t *sizep;
	struct kmem_cache *cp;
	size_t tot_size;
	int offset;

	mblk_cache = kmem_cache_create("streams_mblk", sizeof (mblk_t), 32,
	    NULL, NULL, NULL, NULL, NULL, mblk_kmem_flags);

	for (sizep = dblk_sizes; (size = *sizep) != 0; sizep++) {

		if ((offset = (size & PAGEOFFSET)) != 0) {
			/*
			 * We are in the middle of a page, dblk should
			 * be allocated on the same page
			 */
			tot_size = size + sizeof (dblk_t);
			ASSERT((offset + sizeof (dblk_t) + sizeof (kmem_slab_t))
			    < PAGESIZE);
			ASSERT((tot_size & (DBLK_CACHE_ALIGN - 1)) == 0);

		} else {

			/*
			 * buf size is multiple of page size, dblk and
			 * buffer are allocated separately.
			 */

			ASSERT((size & (DBLK_CACHE_ALIGN - 1)) == 0);
			tot_size = sizeof (dblk_t);
		}

		(void) sprintf(name, "streams_dblk_%ld", size);
		cp = kmem_cache_create(name, tot_size, DBLK_CACHE_ALIGN,
		    dblk_constructor, dblk_destructor, NULL, (void *)(size),
		    NULL, dblk_kmem_flags);

		while (lastsize <= size) {
			dblk_cache[(lastsize - 1) >> DBLK_SIZE_SHIFT] = cp;
			lastsize += DBLK_MIN_SIZE;
		}
	}

	dblk_esb_cache = kmem_cache_create("streams_dblk_esb", sizeof (dblk_t),
	    DBLK_CACHE_ALIGN, dblk_esb_constructor, dblk_destructor, NULL,
	    (void *)sizeof (dblk_t), NULL, dblk_kmem_flags);
	fthdr_cache = kmem_cache_create("streams_fthdr", sizeof (fthdr_t), 32,
	    fthdr_constructor, fthdr_destructor, NULL, NULL, NULL, 0);
	ftblk_cache = kmem_cache_create("streams_ftblk", sizeof (ftblk_t), 32,
	    ftblk_constructor, ftblk_destructor, NULL, NULL, NULL, 0);

	/* Initialize Multidata caches */
	mmd_init();

	/* initialize throttling queue for esballoc */
	esballoc_queue_init();
}

/*ARGSUSED*/
mblk_t *
allocb(size_t size, uint_t pri)
{
	dblk_t *dbp;
	mblk_t *mp;
	size_t index;

	index = (size - 1) >> DBLK_SIZE_SHIFT;

	if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
		if (size != 0) {
			mp = allocb_oversize(size, KM_NOSLEEP);
			goto out;
		}
		index = 0;
	}

	if ((dbp = kmem_cache_alloc(dblk_cache[index], KM_NOSLEEP)) == NULL) {
		mp = NULL;
		goto out;
	}

	mp = dbp->db_mblk;
	DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
	mp->b_next = mp->b_prev = mp->b_cont = NULL;
	mp->b_rptr = mp->b_wptr = dbp->db_base;
	mp->b_queue = NULL;
	MBLK_BAND_FLAG_WORD(mp) = 0;
	STR_FTALLOC(&dbp->db_fthdr, FTEV_ALLOCB, size);
out:
	FTRACE_1("allocb(): mp=0x%p", (uintptr_t)mp);

	return (mp);
}

/*
 * Allocate an mblk taking db_credp and db_cpid from the template.
 * Allow the cred to be NULL.
 */
mblk_t *
allocb_tmpl(size_t size, const mblk_t *tmpl)
{
	mblk_t *mp = allocb(size, 0);

	if (mp != NULL) {
		dblk_t *src = tmpl->b_datap;
		dblk_t *dst = mp->b_datap;
		cred_t *cr;
		pid_t cpid;

		cr = msg_getcred(tmpl, &cpid);
		if (cr != NULL)
			crhold(dst->db_credp = cr);
		dst->db_cpid = cpid;
		dst->db_type = src->db_type;
	}
	return (mp);
}

mblk_t *
allocb_cred(size_t size, cred_t *cr, pid_t cpid)
{
	mblk_t *mp = allocb(size, 0);

	ASSERT(cr != NULL);
	if (mp != NULL) {
		dblk_t *dbp = mp->b_datap;

		crhold(dbp->db_credp = cr);
		dbp->db_cpid = cpid;
	}
	return (mp);
}

mblk_t *
allocb_cred_wait(size_t size, uint_t flags, int *error, cred_t *cr, pid_t cpid)
{
	mblk_t *mp = allocb_wait(size, 0, flags, error);

	ASSERT(cr != NULL);
	if (mp != NULL) {
		dblk_t *dbp = mp->b_datap;

		crhold(dbp->db_credp = cr);
		dbp->db_cpid = cpid;
	}

	return (mp);
}

/*
 * Extract the db_cred (and optionally db_cpid) from a message.
 * We find the first mblk which has a non-NULL db_cred and use that.
 * If none found we return NULL.
 * Does NOT get a hold on the cred.
 */
cred_t *
msg_getcred(const mblk_t *mp, pid_t *cpidp)
{
	cred_t *cr = NULL;
	cred_t *cr2;
	mblk_t *mp2;

	while (mp != NULL) {
		dblk_t *dbp = mp->b_datap;

		cr = dbp->db_credp;
		if (cr == NULL) {
			mp = mp->b_cont;
			continue;
		}
		if (cpidp != NULL)
			*cpidp = dbp->db_cpid;

#ifdef DEBUG
		/*
		 * Normally there should be at most one db_credp in a message.
		 * But if there are multiple (as in the case of some M_IOC*
		 * and some internal messages in TCP/IP bind logic) then
		 * they must be identical in the normal case.
		 * However, a socket can be shared between different uids
		 * in which case data queued in TCP would be from different
		 * creds.  Thus we can only assert for the zoneid being the
		 * same.  Due to Multi-Level Ports for TX, some
		 * cred_t can have a NULL cr_zone, and we skip the comparison
		 * in that case.
		 */
		mp2 = mp->b_cont;
		while (mp2 != NULL) {
			cr2 = DB_CRED(mp2);
			if (cr2 != NULL) {
				DTRACE_PROBE2(msg__getcred,
				    cred_t *, cr, cred_t *, cr2);
				ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
				    crgetzone(cr) == NULL ||
				    crgetzone(cr2) == NULL);
			}
			mp2 = mp2->b_cont;
		}
#endif
		return (cr);
	}
	if (cpidp != NULL)
		*cpidp = NOPID;
	return (NULL);
}
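
/*
 * A minimal usage sketch: a consumer that needs the credentials attached
 * to an inbound message would typically do
 *
 *	pid_t cpid;
 *	cred_t *cr = msg_getcred(mp, &cpid);
 *
 * and, if cr is non-NULL, take a crhold() only when the cred must outlive
 * the message, exactly as allocb_tmpl() above does when it copies the
 * credentials from a template message.
 */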

/*
 * Variant of msg_getcred which, when a cred is found
 * 1. Returns with a hold on the cred
 * 2. Clears the first cred in the mblk.
 * This is more efficient to use than a msg_getcred() + crhold() when
 * the message is freed after the cred has been extracted.
 *
 * The caller is responsible for ensuring that there is no other reference
 * on the message since db_credp can not be cleared when there are other
 * references.
 */
cred_t *
msg_extractcred(mblk_t *mp, pid_t *cpidp)
{
	cred_t *cr = NULL;
	cred_t *cr2;
	mblk_t *mp2;

	while (mp != NULL) {
		dblk_t *dbp = mp->b_datap;

		cr = dbp->db_credp;
		if (cr == NULL) {
			mp = mp->b_cont;
			continue;
		}
		ASSERT(dbp->db_ref == 1);
		dbp->db_credp = NULL;
		if (cpidp != NULL)
			*cpidp = dbp->db_cpid;
#ifdef DEBUG
		/*
		 * Normally there should be at most one db_credp in a message.
		 * But if there are multiple (as in the case of some M_IOC*
		 * and some internal messages in TCP/IP bind logic) then
		 * they must be identical in the normal case.
		 * However, a socket can be shared between different uids
		 * in which case data queued in TCP would be from different
		 * creds.  Thus we can only assert for the zoneid being the
		 * same.  Due to Multi-Level Ports for TX, some
		 * cred_t can have a NULL cr_zone, and we skip the comparison
		 * in that case.
		 */
		mp2 = mp->b_cont;
		while (mp2 != NULL) {
			cr2 = DB_CRED(mp2);
			if (cr2 != NULL) {
				DTRACE_PROBE2(msg__extractcred,
				    cred_t *, cr, cred_t *, cr2);
				ASSERT(crgetzoneid(cr) == crgetzoneid(cr2) ||
				    crgetzone(cr) == NULL ||
				    crgetzone(cr2) == NULL);
			}
			mp2 = mp2->b_cont;
		}
#endif
		return (cr);
	}
	return (NULL);
}

/*
 * Get the label for a message.  Uses the first mblk in the message
 * which has a non-NULL db_credp.
 * Returns NULL if there is no credp.
 */
extern struct ts_label_s *
msg_getlabel(const mblk_t *mp)
{
	cred_t *cr = msg_getcred(mp, NULL);

	if (cr == NULL)
		return (NULL);

	return (crgetlabel(cr));
}

void
freeb(mblk_t *mp)
{
	dblk_t *dbp = mp->b_datap;

	ASSERT(dbp->db_ref > 0);
	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
	FTRACE_1("freeb(): mp=0x%lx", (uintptr_t)mp);

	STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);

	dbp->db_free(mp, dbp);
}

void
freemsg(mblk_t *mp)
{
	FTRACE_1("freemsg(): mp=0x%lx", (uintptr_t)mp);
	while (mp) {
		dblk_t *dbp = mp->b_datap;
		mblk_t *mp_cont = mp->b_cont;

		ASSERT(dbp->db_ref > 0);
		ASSERT(mp->b_next == NULL && mp->b_prev == NULL);

		STR_FTEVENT_MBLK(mp, caller(), FTEV_FREEB, dbp->db_ref);

		dbp->db_free(mp, dbp);
		mp = mp_cont;
	}
}

/*
 * Reallocate a block for another use.  Try hard to use the old block.
 * If the old data is wanted (copy), leave b_wptr at the end of the data,
 * otherwise return b_wptr = b_rptr.
 *
 * This routine is private and unstable.
 */
mblk_t *
reallocb(mblk_t *mp, size_t size, uint_t copy)
{
	mblk_t *mp1;
	unsigned char *old_rptr;
	ptrdiff_t cur_size;

	if (mp == NULL)
		return (allocb(size, BPRI_HI));

	cur_size = mp->b_wptr - mp->b_rptr;
	old_rptr = mp->b_rptr;

	ASSERT(mp->b_datap->db_ref != 0);

	if (mp->b_datap->db_ref == 1 && MBLKSIZE(mp) >= size) {
		/*
		 * If the data is wanted and it will fit where it is, no
		 * work is required.
		 */
		if (copy && mp->b_datap->db_lim - mp->b_rptr >= size)
			return (mp);

		mp->b_wptr = mp->b_rptr = mp->b_datap->db_base;
		mp1 = mp;
	} else if ((mp1 = allocb_tmpl(size, mp)) != NULL) {
		/* XXX other mp state could be copied too, db_flags ... ? */
		mp1->b_cont = mp->b_cont;
	} else {
		return (NULL);
	}

	if (copy) {
		bcopy(old_rptr, mp1->b_rptr, cur_size);
		mp1->b_wptr = mp1->b_rptr + cur_size;
	}

	if (mp != mp1)
		freeb(mp);

	return (mp1);
}

static void
dblk_lastfree(mblk_t *mp, dblk_t *dbp)
{
	ASSERT(dbp->db_mblk == mp);
	if (dbp->db_fthdr != NULL)
		str_ftfree(dbp);

	/* set credp and projid to be 'unspecified' before returning to cache */
	if (dbp->db_credp != NULL) {
		crfree(dbp->db_credp);
		dbp->db_credp = NULL;
	}
	dbp->db_cpid = -1;

	/* Reset the struioflag and the checksum flag fields */
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;

	/* and the COOKED and/or UIOA flag(s) */
	dbp->db_flags &= ~(DBLK_COOKED | DBLK_UIOA);

	kmem_cache_free(dbp->db_cache, dbp);
}

static void
dblk_decref(mblk_t *mp, dblk_t *dbp)
{
	if (dbp->db_ref != 1) {
		uint32_t rtfu = atomic_add_32_nv(&DBLK_RTFU_WORD(dbp),
		    -(1 << DBLK_RTFU_SHIFT(db_ref)));
		/*
		 * atomic_add_32_nv() just decremented db_ref, so we no longer
		 * have a reference to the dblk, which means another thread
		 * could free it.  Therefore we cannot examine the dblk to
		 * determine whether ours was the last reference.  Instead,
		 * we extract the new and minimum reference counts from rtfu.
		 * Note that all we're really saying is "if (ref != refmin)".
		 */
		if (((rtfu >> DBLK_RTFU_SHIFT(db_ref)) & DBLK_REFMAX) !=
		    ((rtfu >> DBLK_RTFU_SHIFT(db_flags)) & DBLK_REFMIN)) {
			kmem_cache_free(mblk_cache, mp);
			return;
		}
	}
	dbp->db_mblk = mp;
	dbp->db_free = dbp->db_lastfree;
	dbp->db_lastfree(mp, dbp);
}

mblk_t *
dupb(mblk_t *mp)
{
	dblk_t *dbp = mp->b_datap;
	mblk_t *new_mp;
	uint32_t oldrtfu, newrtfu;

	if ((new_mp = kmem_cache_alloc(mblk_cache, KM_NOSLEEP)) == NULL)
		goto out;

	new_mp->b_next = new_mp->b_prev = new_mp->b_cont = NULL;
	new_mp->b_rptr = mp->b_rptr;
	new_mp->b_wptr = mp->b_wptr;
	new_mp->b_datap = dbp;
	new_mp->b_queue = NULL;
	MBLK_BAND_FLAG_WORD(new_mp) = MBLK_BAND_FLAG_WORD(mp);

	STR_FTEVENT_MBLK(mp, caller(), FTEV_DUPB, dbp->db_ref);

	dbp->db_free = dblk_decref;
	do {
		ASSERT(dbp->db_ref > 0);
		oldrtfu = DBLK_RTFU_WORD(dbp);
		newrtfu = oldrtfu + (1 << DBLK_RTFU_SHIFT(db_ref));
		/*
		 * If db_ref is maxed out we can't dup this message anymore.
		 */
		if ((oldrtfu & DBLK_RTFU_REF_MASK) == DBLK_RTFU_REF_MASK) {
			kmem_cache_free(mblk_cache, new_mp);
			new_mp = NULL;
			goto out;
		}
	} while (atomic_cas_32(&DBLK_RTFU_WORD(dbp), oldrtfu, newrtfu) !=
	    oldrtfu);

out:
	FTRACE_1("dupb(): new_mp=0x%lx", (uintptr_t)new_mp);
	return (new_mp);
}

static void
dblk_lastfree_desb(mblk_t *mp, dblk_t *dbp)
{
	frtn_t *frp = dbp->db_frtnp;

	ASSERT(dbp->db_mblk == mp);
	frp->free_func(frp->free_arg);
	if (dbp->db_fthdr != NULL)
		str_ftfree(dbp);

	/* set credp and projid to be 'unspecified' before returning to cache */
	if (dbp->db_credp != NULL) {
		crfree(dbp->db_credp);
		dbp->db_credp = NULL;
	}
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;

	kmem_cache_free(dbp->db_cache, dbp);
}

/*ARGSUSED*/
static void
frnop_func(void *arg)
{
}

/*
 * Generic esballoc used to implement the four flavors: [d]esballoc[a].
 */
static mblk_t *
gesballoc(unsigned char *base, size_t size, uint32_t db_rtfu, frtn_t *frp,
    void (*lastfree)(mblk_t *, dblk_t *), int kmflags)
{
	dblk_t *dbp;
	mblk_t *mp;

	ASSERT(base != NULL && frp != NULL);

	if ((dbp = kmem_cache_alloc(dblk_esb_cache, kmflags)) == NULL) {
		mp = NULL;
		goto out;
	}

	mp = dbp->db_mblk;
	dbp->db_base = base;
	dbp->db_lim = base + size;
	dbp->db_free = dbp->db_lastfree = lastfree;
	dbp->db_frtnp = frp;
	DBLK_RTFU_WORD(dbp) = db_rtfu;
	mp->b_next = mp->b_prev = mp->b_cont = NULL;
	mp->b_rptr = mp->b_wptr = base;
	mp->b_queue = NULL;
	MBLK_BAND_FLAG_WORD(mp) = 0;

out:
	FTRACE_1("gesballoc(): mp=0x%lx", (uintptr_t)mp);
	return (mp);
}

/*ARGSUSED*/
mblk_t *
esballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
		    frp, freebs_enqueue, KM_NOSLEEP);

		if (mp != NULL)
			STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
	    frp, freebs_enqueue, KM_NOSLEEP));
}
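
/*
 * A minimal usage sketch for esballoc(): a driver lending out its own
 * buffer pairs it with a free routine through a frtn_t (my_frtn, my_free,
 * my_arg, my_buf and my_len are hypothetical driver-private names):
 *
 *	my_frtn.free_func = my_free;
 *	my_frtn.free_arg = (caddr_t)my_arg;
 *	mp = esballoc(my_buf, my_len, BPRI_MED, &my_frtn);
 *
 * The frtn_t must remain valid for the life of the message; when the last
 * reference is freed the framework eventually calls my_free(my_arg)
 * through db_frtnp, as dblk_lastfree_desb() above illustrates.
 */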

/*
 * Same as esballoc() but sleeps waiting for memory.
 */
/*ARGSUSED*/
mblk_t *
esballoc_wait(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
		    frp, freebs_enqueue, KM_SLEEP);

		STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOC, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
	    frp, freebs_enqueue, KM_SLEEP));
}

/*ARGSUSED*/
mblk_t *
desballoc(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
		    frp, dblk_lastfree_desb, KM_NOSLEEP);

		if (mp != NULL)
			STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOC, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(1, M_DATA, 0, 0),
	    frp, dblk_lastfree_desb, KM_NOSLEEP));
}

/*ARGSUSED*/
mblk_t *
esballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
		    frp, freebs_enqueue, KM_NOSLEEP);

		if (mp != NULL)
			STR_FTALLOC(&DB_FTHDR(mp), FTEV_ESBALLOCA, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
	    frp, freebs_enqueue, KM_NOSLEEP));
}

/*ARGSUSED*/
mblk_t *
desballoca(unsigned char *base, size_t size, uint_t pri, frtn_t *frp)
{
	mblk_t *mp;

	/*
	 * Note that this is structured to allow the common case (i.e.
	 * STREAMS flowtracing disabled) to call gesballoc() with tail
	 * call optimization.
	 */
	if (!str_ftnever) {
		mp = gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
		    frp, dblk_lastfree_desb, KM_NOSLEEP);

		if (mp != NULL)
			STR_FTALLOC(&DB_FTHDR(mp), FTEV_DESBALLOCA, size);
		return (mp);
	}

	return (gesballoc(base, size, DBLK_RTFU(2, M_DATA, 0, 0),
	    frp, dblk_lastfree_desb, KM_NOSLEEP));
}

static void
bcache_dblk_lastfree(mblk_t *mp, dblk_t *dbp)
{
	bcache_t *bcp = dbp->db_cache;

	ASSERT(dbp->db_mblk == mp);
	if (dbp->db_fthdr != NULL)
		str_ftfree(dbp);

	/* set credp and projid to be 'unspecified' before returning to cache */
	if (dbp->db_credp != NULL) {
		crfree(dbp->db_credp);
		dbp->db_credp = NULL;
	}
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;

	mutex_enter(&bcp->mutex);
	kmem_cache_free(bcp->dblk_cache, dbp);
	bcp->alloc--;

	if (bcp->alloc == 0 && bcp->destroy != 0) {
		kmem_cache_destroy(bcp->dblk_cache);
		kmem_cache_destroy(bcp->buffer_cache);
		mutex_exit(&bcp->mutex);
		mutex_destroy(&bcp->mutex);
		kmem_free(bcp, sizeof (bcache_t));
	} else {
		mutex_exit(&bcp->mutex);
	}
}

bcache_t *
bcache_create(char *name, size_t size, uint_t align)
{
	bcache_t *bcp;
	char buffer[255];

	ASSERT((align & (align - 1)) == 0);

	if ((bcp = kmem_alloc(sizeof (bcache_t), KM_NOSLEEP)) == NULL)
		return (NULL);

	bcp->size = size;
	bcp->align = align;
	bcp->alloc = 0;
	bcp->destroy = 0;

	mutex_init(&bcp->mutex, NULL, MUTEX_DRIVER, NULL);

	(void) sprintf(buffer, "%s_buffer_cache", name);
	bcp->buffer_cache = kmem_cache_create(buffer, size, align, NULL, NULL,
	    NULL, NULL, NULL, 0);
	(void) sprintf(buffer, "%s_dblk_cache", name);
	bcp->dblk_cache = kmem_cache_create(buffer, sizeof (dblk_t),
	    DBLK_CACHE_ALIGN, bcache_dblk_constructor, bcache_dblk_destructor,
	    NULL, (void *)bcp, NULL, 0);

	return (bcp);
}

void
bcache_destroy(bcache_t *bcp)
{
	ASSERT(bcp != NULL);

	mutex_enter(&bcp->mutex);
	if (bcp->alloc == 0) {
		kmem_cache_destroy(bcp->dblk_cache);
		kmem_cache_destroy(bcp->buffer_cache);
		mutex_exit(&bcp->mutex);
		mutex_destroy(&bcp->mutex);
		kmem_free(bcp, sizeof (bcache_t));
	} else {
		bcp->destroy++;
		mutex_exit(&bcp->mutex);
	}
}

/*ARGSUSED*/
mblk_t *
bcache_allocb(bcache_t *bcp, uint_t pri)
{
	dblk_t *dbp;
	mblk_t *mp = NULL;

	ASSERT(bcp != NULL);

	mutex_enter(&bcp->mutex);
	if (bcp->destroy != 0) {
		mutex_exit(&bcp->mutex);
		goto out;
	}

	if ((dbp = kmem_cache_alloc(bcp->dblk_cache, KM_NOSLEEP)) == NULL) {
		mutex_exit(&bcp->mutex);
		goto out;
	}
	bcp->alloc++;
	mutex_exit(&bcp->mutex);

	ASSERT(((uintptr_t)(dbp->db_base) & (bcp->align - 1)) == 0);

	mp = dbp->db_mblk;
	DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
	mp->b_next = mp->b_prev = mp->b_cont = NULL;
	mp->b_rptr = mp->b_wptr = dbp->db_base;
	mp->b_queue = NULL;
	MBLK_BAND_FLAG_WORD(mp) = 0;
	STR_FTALLOC(&dbp->db_fthdr, FTEV_BCALLOCB, bcp->size);
out:
	FTRACE_1("bcache_allocb(): mp=0x%p", (uintptr_t)mp);

	return (mp);
}

static void
dblk_lastfree_oversize(mblk_t *mp, dblk_t *dbp)
{
	ASSERT(dbp->db_mblk == mp);
	if (dbp->db_fthdr != NULL)
		str_ftfree(dbp);

	/* set credp and projid to be 'unspecified' before returning to cache */
	if (dbp->db_credp != NULL) {
		crfree(dbp->db_credp);
		dbp->db_credp = NULL;
	}
	dbp->db_cpid = -1;
	dbp->db_struioflag = 0;
	dbp->db_struioun.cksum.flags = 0;

	kmem_free(dbp->db_base, dbp->db_lim - dbp->db_base);
	kmem_cache_free(dbp->db_cache, dbp);
}

static mblk_t *
allocb_oversize(size_t size, int kmflags)
{
	mblk_t *mp;
	void *buf;

	size = P2ROUNDUP(size, DBLK_CACHE_ALIGN);
	if ((buf = kmem_alloc(size, kmflags)) == NULL)
		return (NULL);
	if ((mp = gesballoc(buf, size, DBLK_RTFU(1, M_DATA, 0, 0),
	    &frnop, dblk_lastfree_oversize, kmflags)) == NULL)
		kmem_free(buf, size);

	if (mp != NULL)
		STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBIG, size);

	return (mp);
}

mblk_t *
allocb_tryhard(size_t target_size)
{
	size_t size;
	mblk_t *bp;

	for (size = target_size; size < target_size + 512;
	    size += DBLK_CACHE_ALIGN)
		if ((bp = allocb(size, BPRI_HI)) != NULL)
			return (bp);
	allocb_tryhard_fails++;
	return (NULL);
}

/*
 * This routine is consolidation private for STREAMS internal use
 * This routine may only be called from sync routines (i.e., not
 * from put or service procedures).  It is located here (rather
 * than strsubr.c) so that we don't have to expose all of the
 * allocb() implementation details in header files.
 */
mblk_t *
allocb_wait(size_t size, uint_t pri, uint_t flags, int *error)
{
	dblk_t *dbp;
	mblk_t *mp;
	size_t index;

	index = (size - 1) >> DBLK_SIZE_SHIFT;

	if (flags & STR_NOSIG) {
		if (index >= (DBLK_MAX_CACHE >> DBLK_SIZE_SHIFT)) {
			if (size != 0) {
				mp = allocb_oversize(size, KM_SLEEP);
				FTRACE_1("allocb_wait (NOSIG): mp=0x%lx",
				    (uintptr_t)mp);
				return (mp);
			}
			index = 0;
		}

		dbp = kmem_cache_alloc(dblk_cache[index], KM_SLEEP);
		mp = dbp->db_mblk;
		DBLK_RTFU_WORD(dbp) = DBLK_RTFU(1, M_DATA, 0, 0);
		mp->b_next = mp->b_prev = mp->b_cont = NULL;
		mp->b_rptr = mp->b_wptr = dbp->db_base;
		mp->b_queue = NULL;
		MBLK_BAND_FLAG_WORD(mp) = 0;
		STR_FTALLOC(&DB_FTHDR(mp), FTEV_ALLOCBW, size);

		FTRACE_1("allocb_wait (NOSIG): mp=0x%lx", (uintptr_t)mp);

	} else {
		while ((mp = allocb(size, pri)) == NULL) {
			if ((*error = strwaitbuf(size, BPRI_HI)) != 0)
				return (NULL);
		}
	}

	return (mp);
}

/*
 * Call function 'func' with 'arg' when a class zero block can
 * be allocated with priority 'pri'.
 */
bufcall_id_t
esbbcall(uint_t pri, void (*func)(void *), void *arg)
{
	return (bufcall(1, pri, func, arg));
}

/*
 * Allocates an iocblk (M_IOCTL) block.  Properly sets the credentials,
 * ioc_id, rval and error of the struct ioctl to set up an ioctl call.
 * This provides consistency for all internal allocators of ioctl.
 */
mblk_t *
mkiocb(uint_t cmd)
{
	struct iocblk *ioc;
	mblk_t *mp;

	/*
	 * Allocate enough space for any of the ioctl related messages.
	 */
	if ((mp = allocb(sizeof (union ioctypes), BPRI_MED)) == NULL)
		return (NULL);

	bzero(mp->b_rptr, sizeof (union ioctypes));

	/*
	 * Set the mblk_t information and ptrs correctly.
	 */
	mp->b_wptr += sizeof (struct iocblk);
	mp->b_datap->db_type = M_IOCTL;

	/*
	 * Fill in the fields.
	 */
	ioc = (struct iocblk *)mp->b_rptr;
	ioc->ioc_cmd = cmd;
	ioc->ioc_cr = kcred;
	ioc->ioc_id = getiocseqno();
	ioc->ioc_flag = IOC_NATIVE;
	return (mp);
}
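
/*
 * A minimal usage sketch for mkiocb(): an internal caller builds the
 * M_IOCTL message, optionally links an M_DATA payload, and sends it
 * downstream (MY_IOC_CMD and payload are hypothetical):
 *
 *	mblk_t *iocmp = mkiocb(MY_IOC_CMD);
 *
 *	if (iocmp != NULL) {
 *		iocmp->b_cont = payload;	(optional M_DATA block)
 *		putnext(q, iocmp);
 *	}
 */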

/*
 * test if block of given size can be allocated with a request of
 * the given priority.
 * 'pri' is no longer used, but is retained for compatibility.
 */
/* ARGSUSED */
int
testb(size_t size, uint_t pri)
{
	return ((size + sizeof (dblk_t)) <= kmem_avail());
}

/*
 * Call function 'func' with argument 'arg' when there is a reasonably
 * good chance that a block of size 'size' can be allocated.
 * 'pri' is no longer used, but is retained for compatibility.
 */
/* ARGSUSED */
bufcall_id_t
bufcall(size_t size, uint_t pri, void (*func)(void *), void *arg)
{
	static long bid = 1;	/* always odd to save checking for zero */
	bufcall_id_t bc_id;
	struct strbufcall *bcp;

	if ((bcp = kmem_alloc(sizeof (strbufcall_t), KM_NOSLEEP)) == NULL)
		return (0);

	bcp->bc_func = func;
	bcp->bc_arg = arg;
	bcp->bc_size = size;
	bcp->bc_next = NULL;
	bcp->bc_executor = NULL;

	mutex_enter(&strbcall_lock);
	/*
	 * After bcp is linked into strbcalls and strbcall_lock is dropped there
	 * should be no references to bcp since it may be freed by
	 * runbufcalls().  Since the bc_id field is returned, we save its value
	 * in the local var.
	 */
	bc_id = bcp->bc_id = (bufcall_id_t)(bid += 2);	/* keep it odd */

	/*
	 * add newly allocated stream event to existing
	 * linked list of events.
	 */
	if (strbcalls.bc_head == NULL) {
		strbcalls.bc_head = strbcalls.bc_tail = bcp;
	} else {
		strbcalls.bc_tail->bc_next = bcp;
		strbcalls.bc_tail = bcp;
	}

	cv_signal(&strbcall_cv);
	mutex_exit(&strbcall_lock);
	return (bc_id);
}
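
/*
 * A minimal usage sketch of the usual bufcall() pattern: when an
 * allocation fails in a put/service routine, schedule a retry and cancel
 * it on close (my_bufcall_id and my_retry are hypothetical):
 *
 *	if ((mp = allocb(len, BPRI_MED)) == NULL) {
 *		my_bufcall_id = bufcall(len, BPRI_MED, my_retry, (void *)q);
 *		return;
 *	}
 *
 * and later, e.g. in the module's close routine:
 *
 *	if (my_bufcall_id != 0)
 *		unbufcall(my_bufcall_id);
 *
 * bufcall() returns 0 when it cannot even allocate its own bookkeeping.
 */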

/*
 * Cancel a bufcall request.
 */
void
unbufcall(bufcall_id_t id)
{
	strbufcall_t *bcp, *pbcp;

	mutex_enter(&strbcall_lock);
again:
	pbcp = NULL;
	for (bcp = strbcalls.bc_head; bcp; bcp = bcp->bc_next) {
		if (id == bcp->bc_id)
			break;
		pbcp = bcp;
	}
	if (bcp) {
		if (bcp->bc_executor != NULL) {
			if (bcp->bc_executor != curthread) {
				cv_wait(&bcall_cv, &strbcall_lock);
				goto again;
			}
		} else {
			if (pbcp)
				pbcp->bc_next = bcp->bc_next;
			else
				strbcalls.bc_head = bcp->bc_next;
			if (bcp == strbcalls.bc_tail)
				strbcalls.bc_tail = pbcp;
			kmem_free(bcp, sizeof (strbufcall_t));
		}
	}
	mutex_exit(&strbcall_lock);
}

/*
 * Duplicate a message block by block (uses dupb), returning
 * a pointer to the duplicate message.
 * Returns a non-NULL value only if the entire message
 * was dup'd.
 */
mblk_t *
dupmsg(mblk_t *bp)
{
	mblk_t *head, *nbp;

	if (!bp || !(nbp = head = dupb(bp)))
		return (NULL);

	while (bp->b_cont) {
		if (!(nbp->b_cont = dupb(bp->b_cont))) {
			freemsg(head);
			return (NULL);
		}
		nbp = nbp->b_cont;
		bp = bp->b_cont;
	}
	return (head);
}

#define	DUPB_NOLOAN(bp) \
	((((bp)->b_datap->db_struioflag & STRUIO_ZC) != 0) ? \
	copyb((bp)) : dupb((bp)))

mblk_t *
dupmsg_noloan(mblk_t *bp)
{
	mblk_t *head, *nbp;

	if (bp == NULL || DB_TYPE(bp) != M_DATA ||
	    ((nbp = head = DUPB_NOLOAN(bp)) == NULL))
		return (NULL);

	while (bp->b_cont) {
		if ((nbp->b_cont = DUPB_NOLOAN(bp->b_cont)) == NULL) {
			freemsg(head);
			return (NULL);
		}
		nbp = nbp->b_cont;
		bp = bp->b_cont;
	}
	return (head);
}

/*
 * Copy data from message and data block to newly allocated message and
 * data block.  Returns new message block pointer, or NULL if error.
 * The alignment of rptr (w.r.t. word alignment) will be the same in the copy
 * as in the original even when db_base is not word aligned. (bug 1052877)
 */
mblk_t *
copyb(mblk_t *bp)
{
	mblk_t *nbp;
	dblk_t *dp, *ndp;
	uchar_t *base;
	size_t size;
	size_t unaligned;

	ASSERT(bp->b_wptr >= bp->b_rptr);

	dp = bp->b_datap;
	if (dp->db_fthdr != NULL)
		STR_FTEVENT_MBLK(bp, caller(), FTEV_COPYB, 0);

	/*
	 * Special handling for Multidata message; this should be
	 * removed once a copy-callback routine is made available.
	 */
	if (dp->db_type == M_MULTIDATA) {
		cred_t *cr;

		if ((nbp = mmd_copy(bp, KM_NOSLEEP)) == NULL)
			return (NULL);

		nbp->b_flag = bp->b_flag;
		nbp->b_band = bp->b_band;
		ndp = nbp->b_datap;

		/* See comments below on potential issues. */
		STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);

		ASSERT(ndp->db_type == dp->db_type);
		cr = dp->db_credp;
		if (cr != NULL)
			crhold(ndp->db_credp = cr);
		ndp->db_cpid = dp->db_cpid;
		return (nbp);
	}

	size = dp->db_lim - dp->db_base;
	unaligned = P2PHASE((uintptr_t)dp->db_base, sizeof (uint_t));
	if ((nbp = allocb_tmpl(size + unaligned, bp)) == NULL)
		return (NULL);
	nbp->b_flag = bp->b_flag;
	nbp->b_band = bp->b_band;
	ndp = nbp->b_datap;

	/*
	 * Well, here is a potential issue.  If we are trying to
	 * trace a flow, and we copy the message, we might lose
	 * information about where this message might have been.
	 * So we should inherit the FT data.  On the other hand,
	 * a user might be interested only in alloc to free data.
	 * So I guess the real answer is to provide a tunable.
	 */
	STR_FTEVENT_MBLK(nbp, caller(), FTEV_COPYB, 1);

	base = ndp->db_base + unaligned;
	bcopy(dp->db_base, ndp->db_base + unaligned, size);

	nbp->b_rptr = base + (bp->b_rptr - dp->db_base);
	nbp->b_wptr = nbp->b_rptr + MBLKL(bp);

	return (nbp);
}

/*
 * Copy data from message to newly allocated message using new
 * data blocks.  Returns a pointer to the new message, or NULL if error.
 */
mblk_t *
copymsg(mblk_t *bp)
{
	mblk_t *head, *nbp;

	if (!bp || !(nbp = head = copyb(bp)))
		return (NULL);

	while (bp->b_cont) {
		if (!(nbp->b_cont = copyb(bp->b_cont))) {
			freemsg(head);
			return (NULL);
		}
		nbp = nbp->b_cont;
		bp = bp->b_cont;
	}
	return (head);
}

/*
 * link a message block to tail of message
 */
void
linkb(mblk_t *mp, mblk_t *bp)
{
	ASSERT(mp && bp);

	for (; mp->b_cont; mp = mp->b_cont)
		;
	mp->b_cont = bp;
}

/*
 * unlink a message block from head of message
 * return pointer to new message.
 * NULL if message becomes empty.
 */
mblk_t *
unlinkb(mblk_t *bp)
{
	mblk_t *bp1;

	bp1 = bp->b_cont;
	bp->b_cont = NULL;
	return (bp1);
}

/*
 * remove a message block "bp" from message "mp"
 *
 * Return pointer to new message or NULL if no message remains.
 * Return -1 if bp is not found in message.
 */
mblk_t *
rmvb(mblk_t *mp, mblk_t *bp)
{
	mblk_t *tmp;
	mblk_t *lastp = NULL;

	ASSERT(mp && bp);
	for (tmp = mp; tmp; tmp = tmp->b_cont) {
		if (tmp == bp) {
			if (lastp)
				lastp->b_cont = tmp->b_cont;
			else
				mp = tmp->b_cont;
			tmp->b_cont = NULL;
			return (mp);
		}
		lastp = tmp;
	}
	return ((mblk_t *)-1);
}

/*
 * Concatenate and align first len bytes of common
 * message type.  Len == -1, means concat everything.
 * Returns 1 on success, 0 on failure
 * After the pullup, mp points to the pulled up data.
 */
int
pullupmsg(mblk_t *mp, ssize_t len)
{
	mblk_t *bp, *b_cont;
	dblk_t *dbp;
	ssize_t n;

	ASSERT(mp->b_datap->db_ref > 0);
	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);

	/*
	 * We won't handle Multidata message, since it contains
	 * metadata which this function has no knowledge of; we
	 * assert on DEBUG, and return failure otherwise.
	 */
	ASSERT(mp->b_datap->db_type != M_MULTIDATA);
	if (mp->b_datap->db_type == M_MULTIDATA)
		return (0);

	if (len == -1) {
		if (mp->b_cont == NULL && str_aligned(mp->b_rptr))
			return (1);
		len = xmsgsize(mp);
	} else {
		ssize_t first_mblk_len = mp->b_wptr - mp->b_rptr;
		ASSERT(first_mblk_len >= 0);
		/*
		 * If the length is less than that of the first mblk,
		 * we want to pull up the message into an aligned mblk.
		 * Though not part of the spec, some callers assume it.
		 */
		if (len <= first_mblk_len) {
			if (str_aligned(mp->b_rptr))
				return (1);
			len = first_mblk_len;
		} else if (xmsgsize(mp) < len)
			return (0);
	}

	if ((bp = allocb_tmpl(len, mp)) == NULL)
		return (0);

	dbp = bp->b_datap;
	*bp = *mp;		/* swap mblks so bp heads the old msg... */
	mp->b_datap = dbp;	/* ... and mp heads the new message */
	mp->b_datap->db_mblk = mp;
	bp->b_datap->db_mblk = bp;
	mp->b_rptr = mp->b_wptr = dbp->db_base;

	do {
		ASSERT(bp->b_datap->db_ref > 0);
		ASSERT(bp->b_wptr >= bp->b_rptr);
		n = MIN(bp->b_wptr - bp->b_rptr, len);
		ASSERT(n >= 0);		/* allow zero-length mblk_t's */
		if (n > 0)
			bcopy(bp->b_rptr, mp->b_wptr, (size_t)n);
		mp->b_wptr += n;
		bp->b_rptr += n;
		len -= n;
		if (bp->b_rptr != bp->b_wptr)
			break;
		b_cont = bp->b_cont;
		freeb(bp);
		bp = b_cont;
	} while (len && bp);

	mp->b_cont = bp;	/* tack on whatever wasn't pulled up */

	return (1);
}
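
/*
 * A minimal usage sketch: callers typically use pullupmsg() to guarantee
 * that a fixed-size header is contiguous and aligned before casting
 * b_rptr (struct my_hdr is hypothetical):
 *
 *	if (MBLKL(mp) < sizeof (struct my_hdr) &&
 *	    !pullupmsg(mp, sizeof (struct my_hdr))) {
 *		freemsg(mp);
 *		return;
 *	}
 *	hdr = (struct my_hdr *)mp->b_rptr;
 *
 * msgpullup() below is the non-destructive alternative that leaves the
 * original message untouched.
 */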
1626
1627/*
1628 * Concatenate and align at least the first len bytes of common message
1629 * type. Len == -1 means concatenate everything. The original message is
1630 * unaltered. Returns a pointer to a new message on success, otherwise
1631 * returns NULL.
1632 */
1633mblk_t *
1634msgpullup(mblk_t *mp, ssize_t len)
1635{
1636 mblk_t *newmp;
1637 ssize_t totlen;
1638 ssize_t n;
1639
1640 /*
1641 * We won't handle Multidata message, since it contains
1642 * metadata which this function has no knowledge of; we
1643 * assert on DEBUG, and return failure otherwise.
1644 */
1645 ASSERT(mp->b_datap->db_type != M_MULTIDATA);
1646 if (mp->b_datap->db_type == M_MULTIDATA)
1647 return (NULL);
1648
1649 totlen = xmsgsize(mp);
1650
1651 if ((len > 0) && (len > totlen))
1652 return (NULL);
1653
1654 /*
1655 * Copy all of the first msg type into one new mblk, then dupmsg
1656 * and link the rest onto this.
1657 */
1658
1659 len = totlen;
1660
1661 if ((newmp = allocb_tmpl(len, mp)) == NULL)
1662 return (NULL);
1663
1664 newmp->b_flag = mp->b_flag;
1665 newmp->b_band = mp->b_band;
1666
1667 while (len > 0) {
1668 n = mp->b_wptr - mp->b_rptr;
1669 ASSERT(n >= 0); /* allow zero-length mblk_t's */
1670 if (n > 0)
1671 bcopy(mp->b_rptr, newmp->b_wptr, n);
1672 newmp->b_wptr += n;
1673 len -= n;
1674 mp = mp->b_cont;
1675 }
1676
1677 if (mp != NULL) {
1678 newmp->b_cont = dupmsg(mp);
1679 if (newmp->b_cont == NULL) {
1680 freemsg(newmp);
1681 return (NULL);
1682 }
1683 }
1684
1685 return (newmp);
1686}
1687
1688/*
1689 * Trim bytes from message
1690 * len > 0, trim from head
1691 * len < 0, trim from tail
1692 * Returns 1 on success, 0 on failure.
1693 */
1694int
1695adjmsg(mblk_t *mp, ssize_t len)
1696{
1697 mblk_t *bp;
1698 mblk_t *save_bp = NULL;
1699 mblk_t *prev_bp;
1700 mblk_t *bcont;
1701 unsigned char type;
1702 ssize_t n;
1703 int fromhead;
1704 int first;
1705
1706 ASSERT(mp != NULL);
1707 /*
1708 * We won't handle Multidata message, since it contains
1709 * metadata which this function has no knowledge of; we
1710 * assert on DEBUG, and return failure otherwise.
1711 */
1712 ASSERT(mp->b_datap->db_type != M_MULTIDATA);
1713 if (mp->b_datap->db_type == M_MULTIDATA)
1714 return (0);
1715
1716 if (len < 0) {
1717 fromhead = 0;
1718 len = -len;
1719 } else {
1720 fromhead = 1;
1721 }
1722
1723 if (xmsgsize(mp) < len)
1724 return (0);
1725
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001726 if (fromhead) {
1727 first = 1;
1728 while (len) {
1729 ASSERT(mp->b_wptr >= mp->b_rptr);
1730 n = MIN(mp->b_wptr - mp->b_rptr, len);
1731 mp->b_rptr += n;
1732 len -= n;
1733
1734 /*
1735 * If this is not the first zero length
1736 * message remove it
1737 */
1738 if (!first && (mp->b_wptr == mp->b_rptr)) {
1739 bcont = mp->b_cont;
1740 freeb(mp);
1741 mp = save_bp->b_cont = bcont;
1742 } else {
1743 save_bp = mp;
1744 mp = mp->b_cont;
1745 }
1746 first = 0;
1747 }
1748 } else {
1749 type = mp->b_datap->db_type;
1750 while (len) {
1751 bp = mp;
1752 save_bp = NULL;
1753
1754 /*
1755 * Find the last message of same type
1756 */
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001757 while (bp && bp->b_datap->db_type == type) {
1758 ASSERT(bp->b_wptr >= bp->b_rptr);
1759 prev_bp = save_bp;
1760 save_bp = bp;
1761 bp = bp->b_cont;
1762 }
1763 if (save_bp == NULL)
1764 break;
1765 n = MIN(save_bp->b_wptr - save_bp->b_rptr, len);
1766 save_bp->b_wptr -= n;
1767 len -= n;
1768
1769 /*
1770 * If this is not the first message
1771 * and we have taken away everything
1772 * from this message, remove it
1773 */
1774
1775 if ((save_bp != mp) &&
brutus17169042008-05-23 20:14:10 -07001776 (save_bp->b_wptr == save_bp->b_rptr)) {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001777 bcont = save_bp->b_cont;
1778 freeb(save_bp);
1779 prev_bp->b_cont = bcont;
1780 }
1781 }
1782 }
1783 return (1);
1784}
1785
1786/*
1787 * get number of data bytes in message
1788 */
1789size_t
1790msgdsize(mblk_t *bp)
1791{
1792 size_t count = 0;
1793
1794 for (; bp; bp = bp->b_cont)
1795 if (bp->b_datap->db_type == M_DATA) {
1796 ASSERT(bp->b_wptr >= bp->b_rptr);
1797 count += bp->b_wptr - bp->b_rptr;
1798 }
1799 return (count);
1800}
1801
1802/*
1803 * Get a message off head of queue
1804 *
1805 * If queue has no buffers then mark queue
1806 * with QWANTR. (queue wants to be read by
1807 * someone when data becomes available)
1808 *
1809 * If there is something to take off then do so.
1810 * If queue falls below hi water mark turn off QFULL
1811 * flag. Decrement weighted count of queue.
1812 * Also turn off QWANTR because queue is being read.
1813 *
1814 * The queue count is maintained on a per-band basis.
1815 * Priority band 0 (normal messages) uses q_count,
1816 * q_lowat, etc. Non-zero priority bands use the
1817 * fields in their respective qband structures
1818 * (qb_count, qb_lowat, etc.) All messages appear
1819 * on the same list, linked via their b_next pointers.
1820 * q_first is the head of the list. q_count does
1821 * not reflect the size of all the messages on the
1822 * queue. It only reflects those messages in the
1823 * normal band of flow. The one exception to this
1824 * deals with high priority messages. They are in
1825 * their own conceptual "band", but are accounted
1826 * against q_count.
1827 *
1828 * If queue count is below the lo water mark and QWANTW
1829 * is set, enable the closest backq which has a service
1830 * procedure and turn off the QWANTW flag.
1831 *
1832 * getq could be built on top of rmvq, but isn't because
1833 * of performance considerations.
1834 *
1835 * A note on the use of q_count and q_mblkcnt:
1836 * q_count is the traditional byte count for messages that
1837 * have been put on a queue. Documentation tells us that
1838 * we shouldn't rely on that count, but some drivers/modules
1839 * do. What was needed, however, is a mechanism to prevent
1840 * runaway streams from consuming all of the resources,
1841 * and particularly be able to flow control zero-length
1842 * messages. q_mblkcnt is used for this purpose. It
1843 * counts the number of mblk's that are being put on
1844 * the queue. The intention here, is that each mblk should
1845 * contain one byte of data and, for the purpose of
1846 * flow-control, logically does. A queue will become
1847 * full when EITHER of these values (q_count and q_mblkcnt)
1848 * reach the highwater mark. It will clear when BOTH
1849 * of them drop below the highwater mark. And it will
1850 * backenable when BOTH of them drop below the lowwater
1851 * mark.
1852 * With this algorithm, a driver/module might be able
1853 * to find a reasonably accurate q_count, and the
1854 * framework can still try and limit resource usage.
1855 */
1856mblk_t *
1857getq(queue_t *q)
1858{
1859 mblk_t *bp;
micheng116094b2005-07-26 01:08:29 -07001860 uchar_t band = 0;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001861
ja97890301ce412008-06-02 07:48:31 -07001862 bp = getq_noenab(q, 0);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001863 if (bp != NULL)
1864 band = bp->b_band;
1865
1866 /*
1867 * Inlined from qbackenable().
1868 * Quick check without holding the lock.
1869 */
1870 if (band == 0 && (q->q_flag & (QWANTW|QWANTWSYNC)) == 0)
1871 return (bp);
1872
1873 qbackenable(q, band);
1874 return (bp);
1875}
1876
1877/*
masputraff550d02005-10-22 22:50:14 -07001878 * Calculate number of data bytes in a single data message block taking
1879 * multidata messages into account.
1880 */
1881
1882#define ADD_MBLK_SIZE(mp, size) \
1883 if (DB_TYPE(mp) != M_MULTIDATA) { \
1884 (size) += MBLKL(mp); \
1885 } else { \
1886 uint_t pinuse; \
1887 \
1888 mmd_getsize(mmd_getmultidata(mp), NULL, &pinuse); \
1889 (size) += pinuse; \
1890 }
1891
1892/*
ja97890301ce412008-06-02 07:48:31 -07001893 * Returns the number of bytes in a message (a message is defined as a
1894 * chain of mblks linked by b_cont). If a non-NULL mblkcnt is supplied we
1895 * also return the number of distinct mblks in the message.
1896 */
1897int
1898mp_cont_len(mblk_t *bp, int *mblkcnt)
1899{
1900 mblk_t *mp;
1901 int mblks = 0;
1902 int bytes = 0;
1903
1904 for (mp = bp; mp != NULL; mp = mp->b_cont) {
1905 ADD_MBLK_SIZE(mp, bytes);
1906 mblks++;
1907 }
1908
1909 if (mblkcnt != NULL)
1910 *mblkcnt = mblks;
1911
1912 return (bytes);
1913}
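
/*
 * Illustrative sketch, not part of the original file: mp_cont_len() walks
 * the b_cont chain, so for a two-block chain it reports the sum of the
 * block lengths and the number of blocks.  The sizes below are arbitrary
 * and the helper example_count_chain() is hypothetical.
 */
static void
example_count_chain(void)
{
	mblk_t *mp = allocb(64, BPRI_MED);
	mblk_t *cont = allocb(128, BPRI_MED);
	int mblks, bytes;

	if (mp == NULL || cont == NULL) {
		freemsg(mp);		/* freemsg(NULL) is a no-op */
		freemsg(cont);
		return;
	}
	mp->b_wptr += 64;		/* pretend 64 bytes were written */
	cont->b_wptr += 128;		/* and another 128 in the next block */
	linkb(mp, cont);		/* chain the blocks via b_cont */

	bytes = mp_cont_len(mp, &mblks);
	ASSERT(bytes == 192 && mblks == 2);
	freemsg(mp);			/* frees the whole chain */
}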
1914
1915/*
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001916 * Like getq() but does not backenable. This is used by the stream
1917 * head when a putback() is likely. The caller must call qbackenable()
1918 * after it is done with accessing the queue.
ja97890301ce412008-06-02 07:48:31 -07001919 * The rbytes argument to getq_noenab() allows callers to specify
 1920 * the maximum number of bytes to return.  If the current amount on the
1921 * queue is less than this then the entire message will be returned.
1922 * A value of 0 returns the entire message and is equivalent to the old
1923 * default behaviour prior to the addition of the rbytes argument.
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001924 */
1925mblk_t *
ja97890301ce412008-06-02 07:48:31 -07001926getq_noenab(queue_t *q, ssize_t rbytes)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001927{
ja97890301ce412008-06-02 07:48:31 -07001928 mblk_t *bp, *mp1;
1929 mblk_t *mp2 = NULL;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07001930 qband_t *qbp;
1931 kthread_id_t freezer;
1932 int bytecnt = 0, mblkcnt = 0;
1933
1934 /* freezestr should allow its caller to call getq/putq */
1935 freezer = STREAM(q)->sd_freezer;
1936 if (freezer == curthread) {
1937 ASSERT(frozenstr(q));
1938 ASSERT(MUTEX_HELD(QLOCK(q)));
1939 } else
1940 mutex_enter(QLOCK(q));
1941
1942 if ((bp = q->q_first) == 0) {
1943 q->q_flag |= QWANTR;
1944 } else {
ja97890301ce412008-06-02 07:48:31 -07001945 /*
1946 * If the caller supplied a byte threshold and there is
1947 * more than this amount on the queue then break up the
 1948 * message appropriately.  We can only safely do
1949 * this for M_DATA messages.
1950 */
1951 if ((DB_TYPE(bp) == M_DATA) && (rbytes > 0) &&
1952 (q->q_count > rbytes)) {
1953 /*
1954 * Inline version of mp_cont_len() which terminates
1955 * when we meet or exceed rbytes.
1956 */
1957 for (mp1 = bp; mp1 != NULL; mp1 = mp1->b_cont) {
1958 mblkcnt++;
1959 ADD_MBLK_SIZE(mp1, bytecnt);
1960 if (bytecnt >= rbytes)
1961 break;
1962 }
1963 /*
1964 * We need to account for the following scenarios:
1965 *
1966 * 1) Too much data in the first message:
1967 * mp1 will be the mblk which puts us over our
1968 * byte limit.
1969 * 2) Not enough data in the first message:
1970 * mp1 will be NULL.
1971 * 3) Exactly the right amount of data contained within
1972 * whole mblks:
1973 * mp1->b_cont will be where we break the message.
1974 */
1975 if (bytecnt > rbytes) {
1976 /*
1977 * Dup/copy mp1 and put what we don't need
1978 * back onto the queue. Adjust the read/write
1979 * and continuation pointers appropriately
1980 * and decrement the current mblk count to
1981 * reflect we are putting an mblk back onto
1982 * the queue.
1983 * When adjusting the message pointers, it's
1984 * OK to use the existing bytecnt and the
1985 * requested amount (rbytes) to calculate the
 1986 * new write offset (b_wptr) of what we
1987 * are taking. However, we cannot use these
1988 * values when calculating the read offset of
1989 * the mblk we are putting back on the queue.
 1990 * This is because the beginning (b_rptr) of the
1991 * mblk represents some arbitrary point within
1992 * the message.
1993 * It's simplest to do this by advancing b_rptr
1994 * by the new length of mp1 as we don't have to
1995 * remember any intermediate state.
1996 */
1997 ASSERT(mp1 != NULL);
1998 mblkcnt--;
1999 if ((mp2 = dupb(mp1)) == NULL &&
2000 (mp2 = copyb(mp1)) == NULL) {
2001 bytecnt = mblkcnt = 0;
2002 goto dup_failed;
2003 }
2004 mp2->b_cont = mp1->b_cont;
2005 mp1->b_wptr -= bytecnt - rbytes;
2006 mp2->b_rptr += mp1->b_wptr - mp1->b_rptr;
2007 mp1->b_cont = NULL;
2008 bytecnt = rbytes;
2009 } else {
2010 /*
2011 * Either there is not enough data in the first
2012 * message or there is no excess data to deal
2013 * with. If mp1 is NULL, we are taking the
2014 * whole message. No need to do anything.
2015 * Otherwise we assign mp1->b_cont to mp2 as
2016 * we will be putting this back onto the head of
2017 * the queue.
2018 */
2019 if (mp1 != NULL) {
2020 mp2 = mp1->b_cont;
2021 mp1->b_cont = NULL;
2022 }
2023 }
2024 /*
2025 * If mp2 is not NULL then we have part of the message
2026 * to put back onto the queue.
2027 */
2028 if (mp2 != NULL) {
2029 if ((mp2->b_next = bp->b_next) == NULL)
2030 q->q_last = mp2;
2031 else
2032 bp->b_next->b_prev = mp2;
2033 q->q_first = mp2;
2034 } else {
2035 if ((q->q_first = bp->b_next) == NULL)
2036 q->q_last = NULL;
2037 else
2038 q->q_first->b_prev = NULL;
2039 }
2040 } else {
2041 /*
2042 * Either no byte threshold was supplied, there is
2043 * not enough on the queue or we failed to
2044 * duplicate/copy a data block. In these cases we
2045 * just take the entire first message.
2046 */
2047dup_failed:
2048 bytecnt = mp_cont_len(bp, &mblkcnt);
2049 if ((q->q_first = bp->b_next) == NULL)
2050 q->q_last = NULL;
2051 else
2052 q->q_first->b_prev = NULL;
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002053 }
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002054 if (bp->b_band == 0) {
2055 q->q_count -= bytecnt;
2056 q->q_mblkcnt -= mblkcnt;
rk129064ba464302007-10-29 14:07:16 -07002057 if (q->q_mblkcnt == 0 || ((q->q_count < q->q_hiwat) &&
2058 (q->q_mblkcnt < q->q_hiwat))) {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002059 q->q_flag &= ~QFULL;
2060 }
2061 } else {
2062 int i;
2063
2064 ASSERT(bp->b_band <= q->q_nband);
2065 ASSERT(q->q_bandp != NULL);
2066 ASSERT(MUTEX_HELD(QLOCK(q)));
2067 qbp = q->q_bandp;
2068 i = bp->b_band;
2069 while (--i > 0)
2070 qbp = qbp->qb_next;
2071 if (qbp->qb_first == qbp->qb_last) {
2072 qbp->qb_first = NULL;
2073 qbp->qb_last = NULL;
2074 } else {
2075 qbp->qb_first = bp->b_next;
2076 }
2077 qbp->qb_count -= bytecnt;
2078 qbp->qb_mblkcnt -= mblkcnt;
rk129064ba464302007-10-29 14:07:16 -07002079 if (qbp->qb_mblkcnt == 0 ||
2080 ((qbp->qb_count < qbp->qb_hiwat) &&
2081 (qbp->qb_mblkcnt < qbp->qb_hiwat))) {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002082 qbp->qb_flag &= ~QB_FULL;
2083 }
2084 }
2085 q->q_flag &= ~QWANTR;
2086 bp->b_next = NULL;
2087 bp->b_prev = NULL;
2088 }
2089 if (freezer != curthread)
2090 mutex_exit(QLOCK(q));
2091
2092 STR_FTEVENT_MSG(bp, q, FTEV_GETQ, NULL);
2093
2094 return (bp);
2095}
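
/*
 * Illustrative sketch, not part of the original file: a caller in the
 * style of the stream head pairs getq_noenab() with an explicit
 * qbackenable() once it is finished with the queue.  The function name
 * example_read_part and the byte limit are hypothetical.
 */
static void
example_read_part(queue_t *q, ssize_t maxbytes)
{
	mblk_t *mp;
	uchar_t band = 0;

	/* Take at most (roughly) maxbytes; 0 would mean the whole message. */
	mp = getq_noenab(q, maxbytes);
	if (mp != NULL) {
		band = mp->b_band;
		/* A real caller would hand the data to its consumer here. */
		freemsg(mp);
	}

	/* getq_noenab() skipped the backenable, so perform it now. */
	qbackenable(q, band);
}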
2096
2097/*
2098 * Determine if a backenable is needed after removing a message in the
2099 * specified band.
 2100 * NOTE: This routine assumes that something like getq_noenab() has
 2101 * already been called.
2102 *
2103 * For the read side it is ok to hold sd_lock across calling this (and the
2104 * stream head often does).
2105 * But for the write side strwakeq might be invoked and it acquires sd_lock.
2106 */
2107void
micheng116094b2005-07-26 01:08:29 -07002108qbackenable(queue_t *q, uchar_t band)
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002109{
2110 int backenab = 0;
2111 qband_t *qbp;
2112 kthread_id_t freezer;
2113
2114 ASSERT(q);
2115 ASSERT((q->q_flag & QREADR) || MUTEX_NOT_HELD(&STREAM(q)->sd_lock));
2116
2117 /*
2118 * Quick check without holding the lock.
2119 * OK since after getq() has lowered the q_count these flags
2120 * would not change unless either the qbackenable() is done by
2121 * another thread (which is ok) or the queue has gotten QFULL
2122 * in which case another backenable will take place when the queue
2123 * drops below q_lowat.
2124 */
2125 if (band == 0 && (q->q_flag & (QWANTW|QWANTWSYNC)) == 0)
2126 return;
2127
2128 /* freezestr should allow its caller to call getq/putq */
2129 freezer = STREAM(q)->sd_freezer;
2130 if (freezer == curthread) {
2131 ASSERT(frozenstr(q));
2132 ASSERT(MUTEX_HELD(QLOCK(q)));
2133 } else
2134 mutex_enter(QLOCK(q));
2135
2136 if (band == 0) {
2137 if (q->q_lowat == 0 || (q->q_count < q->q_lowat &&
2138 q->q_mblkcnt < q->q_lowat)) {
2139 backenab = q->q_flag & (QWANTW|QWANTWSYNC);
2140 }
2141 } else {
2142 int i;
2143
2144 ASSERT((unsigned)band <= q->q_nband);
2145 ASSERT(q->q_bandp != NULL);
2146
2147 qbp = q->q_bandp;
2148 i = band;
2149 while (--i > 0)
2150 qbp = qbp->qb_next;
2151
2152 if (qbp->qb_lowat == 0 || (qbp->qb_count < qbp->qb_lowat &&
2153 qbp->qb_mblkcnt < qbp->qb_lowat)) {
2154 backenab = qbp->qb_flag & QB_WANTW;
2155 }
2156 }
2157
2158 if (backenab == 0) {
2159 if (freezer != curthread)
2160 mutex_exit(QLOCK(q));
2161 return;
2162 }
2163
2164 /* Have to drop the lock across strwakeq and backenable */
2165 if (backenab & QWANTWSYNC)
2166 q->q_flag &= ~QWANTWSYNC;
2167 if (backenab & (QWANTW|QB_WANTW)) {
2168 if (band != 0)
2169 qbp->qb_flag &= ~QB_WANTW;
2170 else {
2171 q->q_flag &= ~QWANTW;
2172 }
2173 }
2174
2175 if (freezer != curthread)
2176 mutex_exit(QLOCK(q));
2177
2178 if (backenab & QWANTWSYNC)
2179 strwakeq(q, QWANTWSYNC);
2180 if (backenab & (QWANTW|QB_WANTW))
2181 backenable(q, band);
2182}
2183
2184/*
2185 * Remove a message from a queue. The queue count and other
2186 * flow control parameters are adjusted and the back queue
2187 * enabled if necessary.
2188 *
 2189 * rmvq can be called with the stream frozen, by other utility functions
 2190 * holding QLOCK, and by streams modules without any locks or freezing.
2191 */
2192void
2193rmvq(queue_t *q, mblk_t *mp)
2194{
2195 ASSERT(mp != NULL);
2196
2197 rmvq_noenab(q, mp);
2198 if (curthread != STREAM(q)->sd_freezer && MUTEX_HELD(QLOCK(q))) {
2199 /*
2200 * qbackenable can handle a frozen stream but not a "random"
2201 * qlock being held. Drop lock across qbackenable.
2202 */
2203 mutex_exit(QLOCK(q));
2204 qbackenable(q, mp->b_band);
2205 mutex_enter(QLOCK(q));
2206 } else {
2207 qbackenable(q, mp->b_band);
2208 }
2209}
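
/*
 * Illustrative sketch, not part of the original file: one of the calling
 * patterns mentioned above is a module that freezes the stream, walks the
 * queue, and uses rmvq() to drop selected messages.  Dropping M_BREAK
 * messages is an arbitrary choice and example_drop_breaks is hypothetical.
 */
static void
example_drop_breaks(queue_t *q)
{
	mblk_t *mp, *next;

	freezestr(q);			/* stabilizes q_first and b_next */
	for (mp = q->q_first; mp != NULL; mp = next) {
		next = mp->b_next;	/* remember before unlinking */
		if (DB_TYPE(mp) == M_BREAK) {
			rmvq(q, mp);	/* adjusts counts, may backenable */
			freemsg(mp);
		}
	}
	unfreezestr(q);
}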
2210
2211/*
2212 * Like rmvq() but without any backenabling.
2213 * This exists to handle SR_CONSOL_DATA in strrput().
2214 */
2215void
2216rmvq_noenab(queue_t *q, mblk_t *mp)
2217{
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002218 int i;
2219 qband_t *qbp = NULL;
2220 kthread_id_t freezer;
2221 int bytecnt = 0, mblkcnt = 0;
2222
2223 freezer = STREAM(q)->sd_freezer;
2224 if (freezer == curthread) {
2225 ASSERT(frozenstr(q));
2226 ASSERT(MUTEX_HELD(QLOCK(q)));
2227 } else if (MUTEX_HELD(QLOCK(q))) {
2228 /* Don't drop lock on exit */
2229 freezer = curthread;
2230 } else
2231 mutex_enter(QLOCK(q));
2232
2233 ASSERT(mp->b_band <= q->q_nband);
2234 if (mp->b_band != 0) { /* Adjust band pointers */
2235 ASSERT(q->q_bandp != NULL);
2236 qbp = q->q_bandp;
2237 i = mp->b_band;
2238 while (--i > 0)
2239 qbp = qbp->qb_next;
2240 if (mp == qbp->qb_first) {
2241 if (mp->b_next && mp->b_band == mp->b_next->b_band)
2242 qbp->qb_first = mp->b_next;
2243 else
2244 qbp->qb_first = NULL;
2245 }
2246 if (mp == qbp->qb_last) {
2247 if (mp->b_prev && mp->b_band == mp->b_prev->b_band)
2248 qbp->qb_last = mp->b_prev;
2249 else
2250 qbp->qb_last = NULL;
2251 }
2252 }
2253
2254 /*
2255 * Remove the message from the list.
2256 */
2257 if (mp->b_prev)
2258 mp->b_prev->b_next = mp->b_next;
2259 else
2260 q->q_first = mp->b_next;
2261 if (mp->b_next)
2262 mp->b_next->b_prev = mp->b_prev;
2263 else
2264 q->q_last = mp->b_prev;
2265 mp->b_next = NULL;
2266 mp->b_prev = NULL;
2267
2268 /* Get the size of the message for q_count accounting */
ja97890301ce412008-06-02 07:48:31 -07002269 bytecnt = mp_cont_len(mp, &mblkcnt);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002270
2271 if (mp->b_band == 0) { /* Perform q_count accounting */
2272 q->q_count -= bytecnt;
2273 q->q_mblkcnt -= mblkcnt;
rk129064ba464302007-10-29 14:07:16 -07002274 if (q->q_mblkcnt == 0 || ((q->q_count < q->q_hiwat) &&
2275 (q->q_mblkcnt < q->q_hiwat))) {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002276 q->q_flag &= ~QFULL;
2277 }
2278 } else { /* Perform qb_count accounting */
2279 qbp->qb_count -= bytecnt;
2280 qbp->qb_mblkcnt -= mblkcnt;
rk129064ba464302007-10-29 14:07:16 -07002281 if (qbp->qb_mblkcnt == 0 || ((qbp->qb_count < qbp->qb_hiwat) &&
2282 (qbp->qb_mblkcnt < qbp->qb_hiwat))) {
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002283 qbp->qb_flag &= ~QB_FULL;
2284 }
2285 }
2286 if (freezer != curthread)
2287 mutex_exit(QLOCK(q));
2288
2289 STR_FTEVENT_MSG(mp, q, FTEV_RMVQ, NULL);
2290}
2291
2292/*
2293 * Empty a queue.
2294 * If flag is set, remove all messages. Otherwise, remove
2295 * only non-control messages. If queue falls below its low
2296 * water mark, and QWANTW is set, enable the nearest upstream
2297 * service procedure.
2298 *
2299 * Historical note: when merging the M_FLUSH code in strrput with this
2300 * code one difference was discovered. flushq did not have a check
2301 * for q_lowat == 0 in the backenabling test.
2302 *
 2303 * pcproto_flag specifies whether or not an M_PCPROTO message should be flushed
2304 * if one exists on the queue.
2305 */
2306void
2307flushq_common(queue_t *q, int flag, int pcproto_flag)
2308{
2309 mblk_t *mp, *nmp;
2310 qband_t *qbp;
2311 int backenab = 0;
2312 unsigned char bpri;
2313 unsigned char qbf[NBAND]; /* band flushing backenable flags */
2314
2315 if (q->q_first == NULL)
2316 return;
2317
2318 mutex_enter(QLOCK(q));
2319 mp = q->q_first;
2320 q->q_first = NULL;
2321 q->q_last = NULL;
2322 q->q_count = 0;
2323 q->q_mblkcnt = 0;
2324 for (qbp = q->q_bandp; qbp; qbp = qbp->qb_next) {
2325 qbp->qb_first = NULL;
2326 qbp->qb_last = NULL;
2327 qbp->qb_count = 0;
2328 qbp->qb_mblkcnt = 0;
2329 qbp->qb_flag &= ~QB_FULL;
2330 }
2331 q->q_flag &= ~QFULL;
2332 mutex_exit(QLOCK(q));
2333 while (mp) {
2334 nmp = mp->b_next;
2335 mp->b_next = mp->b_prev = NULL;
2336
2337 STR_FTEVENT_MBLK(mp, q, FTEV_FLUSHQ, NULL);
2338
2339 if (pcproto_flag && (mp->b_datap->db_type == M_PCPROTO))
2340 (void) putq(q, mp);
2341 else if (flag || datamsg(mp->b_datap->db_type))
2342 freemsg(mp);
2343 else
2344 (void) putq(q, mp);
2345 mp = nmp;
2346 }
2347 bpri = 1;
2348 mutex_enter(QLOCK(q));
2349 for (qbp = q->q_bandp; qbp; qbp = qbp->qb_next) {
2350 if ((qbp->qb_flag & QB_WANTW) &&
2351 (((qbp->qb_count < qbp->qb_lowat) &&
2352 (qbp->qb_mblkcnt < qbp->qb_lowat)) ||
2353 qbp->qb_lowat == 0)) {
2354 qbp->qb_flag &= ~QB_WANTW;
2355 backenab = 1;
2356 qbf[bpri] = 1;
2357 } else
2358 qbf[bpri] = 0;
2359 bpri++;
2360 }
2361 ASSERT(bpri == (unsigned char)(q->q_nband + 1));
2362 if ((q->q_flag & QWANTW) &&
2363 (((q->q_count < q->q_lowat) &&
2364 (q->q_mblkcnt < q->q_lowat)) || q->q_lowat == 0)) {
2365 q->q_flag &= ~QWANTW;
2366 backenab = 1;
2367 qbf[0] = 1;
2368 } else
2369 qbf[0] = 0;
2370
2371 /*
2372 * If any band can now be written to, and there is a writer
2373 * for that band, then backenable the closest service procedure.
2374 */
2375 if (backenab) {
2376 mutex_exit(QLOCK(q));
2377 for (bpri = q->q_nband; bpri != 0; bpri--)
2378 if (qbf[bpri])
micheng116094b2005-07-26 01:08:29 -07002379 backenable(q, bpri);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002380 if (qbf[0])
2381 backenable(q, 0);
2382 } else
2383 mutex_exit(QLOCK(q));
2384}
2385
2386/*
 2387 * The real flushing takes place in flushq_common.  This is done so that
 2388 * a flag can specify whether or not M_PCPROTO messages should be
 2389 * flushed.  Currently the only place that uses this flag is the stream head.
2390 */
2391void
2392flushq(queue_t *q, int flag)
2393{
2394 flushq_common(q, flag, 0);
2395}
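
/*
 * Illustrative sketch, not part of the original file: the conventional use
 * of flushq() is in a module's put procedure when an M_FLUSH message
 * arrives; the module flushes its own queues as directed and passes the
 * message along.  example_rput is a hypothetical read-side put procedure.
 */
static int
example_rput(queue_t *q, mblk_t *mp)
{
	if (DB_TYPE(mp) == M_FLUSH) {
		if (*mp->b_rptr & FLUSHR)
			flushq(q, FLUSHDATA);		/* our read queue */
		if (*mp->b_rptr & FLUSHW)
			flushq(WR(q), FLUSHDATA);	/* our write queue */
	}
	putnext(q, mp);		/* pass the message (flush or not) upstream */
	return (0);
}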
2396
2397/*
2398 * Flush the queue of messages of the given priority band.
2399 * There is some duplication of code between flushq and flushband.
2400 * This is because we want to optimize the code as much as possible.
2401 * The assumption is that there will be more messages in the normal
2402 * (priority 0) band than in any other.
2403 *
2404 * Historical note: when merging the M_FLUSH code in strrput with this
 2405 * code one difference was discovered.  flushband had an extra check for
 2406 * (mp->b_datap->db_type < QPCTL) in the band 0 case.  That check does not
 2407 * match the man page for flushband and was not in the strrput flush code,
 2408 * hence it was removed.
2409 */
2410void
2411flushband(queue_t *q, unsigned char pri, int flag)
2412{
2413 mblk_t *mp;
2414 mblk_t *nmp;
2415 mblk_t *last;
2416 qband_t *qbp;
2417 int band;
2418
2419 ASSERT((flag == FLUSHDATA) || (flag == FLUSHALL));
2420 if (pri > q->q_nband) {
2421 return;
2422 }
2423 mutex_enter(QLOCK(q));
2424 if (pri == 0) {
2425 mp = q->q_first;
2426 q->q_first = NULL;
2427 q->q_last = NULL;
2428 q->q_count = 0;
2429 q->q_mblkcnt = 0;
2430 for (qbp = q->q_bandp; qbp; qbp = qbp->qb_next) {
2431 qbp->qb_first = NULL;
2432 qbp->qb_last = NULL;
2433 qbp->qb_count = 0;
2434 qbp->qb_mblkcnt = 0;
2435 qbp->qb_flag &= ~QB_FULL;
2436 }
2437 q->q_flag &= ~QFULL;
2438 mutex_exit(QLOCK(q));
2439 while (mp) {
2440 nmp = mp->b_next;
2441 mp->b_next = mp->b_prev = NULL;
2442 if ((mp->b_band == 0) &&
brutus17169042008-05-23 20:14:10 -07002443 ((flag == FLUSHALL) ||
2444 datamsg(mp->b_datap->db_type)))
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002445 freemsg(mp);
2446 else
2447 (void) putq(q, mp);
2448 mp = nmp;
2449 }
2450 mutex_enter(QLOCK(q));
2451 if ((q->q_flag & QWANTW) &&
2452 (((q->q_count < q->q_lowat) &&
2453 (q->q_mblkcnt < q->q_lowat)) || q->q_lowat == 0)) {
2454 q->q_flag &= ~QWANTW;
2455 mutex_exit(QLOCK(q));
2456
micheng116094b2005-07-26 01:08:29 -07002457 backenable(q, pri);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002458 } else
2459 mutex_exit(QLOCK(q));
2460 } else { /* pri != 0 */
2461 boolean_t flushed = B_FALSE;
2462 band = pri;
2463
2464 ASSERT(MUTEX_HELD(QLOCK(q)));
2465 qbp = q->q_bandp;
2466 while (--band > 0)
2467 qbp = qbp->qb_next;
2468 mp = qbp->qb_first;
2469 if (mp == NULL) {
2470 mutex_exit(QLOCK(q));
2471 return;
2472 }
2473 last = qbp->qb_last->b_next;
2474 /*
2475 * rmvq_noenab() and freemsg() are called for each mblk that
2476 * meets the criteria. The loop is executed until the last
2477 * mblk has been processed.
2478 */
2479 while (mp != last) {
2480 ASSERT(mp->b_band == pri);
2481 nmp = mp->b_next;
2482 if (flag == FLUSHALL || datamsg(mp->b_datap->db_type)) {
2483 rmvq_noenab(q, mp);
2484 freemsg(mp);
2485 flushed = B_TRUE;
2486 }
2487 mp = nmp;
2488 }
2489 mutex_exit(QLOCK(q));
2490
2491 /*
2492 * If any mblk(s) has been freed, we know that qbackenable()
2493 * will need to be called.
2494 */
2495 if (flushed)
micheng116094b2005-07-26 01:08:29 -07002496 qbackenable(q, pri);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002497 }
2498}
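
/*
 * Illustrative sketch, not part of the original file: when an M_FLUSH
 * message carries FLUSHBAND, its second byte names the band, and only
 * that band is flushed with flushband(); otherwise the whole queue is
 * flushed.  example_handle_flush is a hypothetical helper.
 */
static void
example_handle_flush(queue_t *q, mblk_t *mp)
{
	if (*mp->b_rptr & FLUSHR) {
		if (*mp->b_rptr & FLUSHBAND)
			flushband(q, *(mp->b_rptr + 1), FLUSHDATA);
		else
			flushq(q, FLUSHDATA);
	}
}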
2499
2500/*
2501 * Return 1 if the queue is not full. If the queue is full, return
2502 * 0 (may not put message) and set QWANTW flag (caller wants to write
2503 * to the queue).
2504 */
2505int
2506canput(queue_t *q)
2507{
2508 TRACE_1(TR_FAC_STREAMS_FR, TR_CANPUT_IN, "canput:%p", q);
2509
2510 /* this is for loopback transports, they should not do a canput */
2511 ASSERT(STRMATED(q->q_stream) || STREAM(q) == STREAM(q->q_nfsrv));
2512
2513 /* Find next forward module that has a service procedure */
2514 q = q->q_nfsrv;
2515
2516 if (!(q->q_flag & QFULL)) {
2517 TRACE_2(TR_FAC_STREAMS_FR, TR_CANPUT_OUT, "canput:%p %d", q, 1);
2518 return (1);
2519 }
2520 mutex_enter(QLOCK(q));
2521 if (q->q_flag & QFULL) {
2522 q->q_flag |= QWANTW;
2523 mutex_exit(QLOCK(q));
2524 TRACE_2(TR_FAC_STREAMS_FR, TR_CANPUT_OUT, "canput:%p %d", q, 0);
2525 return (0);
2526 }
2527 mutex_exit(QLOCK(q));
2528 TRACE_2(TR_FAC_STREAMS_FR, TR_CANPUT_OUT, "canput:%p %d", q, 1);
2529 return (1);
2530}
2531
2532/*
2533 * This is the new canput for use with priority bands. Return 1 if the
2534 * band is not full. If the band is full, return 0 (may not put message)
2535 * and set QWANTW(QB_WANTW) flag for zero(non-zero) band (caller wants to
2536 * write to the queue).
2537 */
2538int
2539bcanput(queue_t *q, unsigned char pri)
2540{
2541 qband_t *qbp;
2542
2543 TRACE_2(TR_FAC_STREAMS_FR, TR_BCANPUT_IN, "bcanput:%p %p", q, pri);
2544 if (!q)
2545 return (0);
2546
2547 /* Find next forward module that has a service procedure */
2548 q = q->q_nfsrv;
2549
2550 mutex_enter(QLOCK(q));
2551 if (pri == 0) {
2552 if (q->q_flag & QFULL) {
2553 q->q_flag |= QWANTW;
2554 mutex_exit(QLOCK(q));
2555 TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
brutus17169042008-05-23 20:14:10 -07002556 "bcanput:%p %X %d", q, pri, 0);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002557 return (0);
2558 }
2559 } else { /* pri != 0 */
2560 if (pri > q->q_nband) {
2561 /*
2562 * No band exists yet, so return success.
2563 */
2564 mutex_exit(QLOCK(q));
2565 TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
brutus17169042008-05-23 20:14:10 -07002566 "bcanput:%p %X %d", q, pri, 1);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002567 return (1);
2568 }
2569 qbp = q->q_bandp;
2570 while (--pri)
2571 qbp = qbp->qb_next;
2572 if (qbp->qb_flag & QB_FULL) {
2573 qbp->qb_flag |= QB_WANTW;
2574 mutex_exit(QLOCK(q));
2575 TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
brutus17169042008-05-23 20:14:10 -07002576 "bcanput:%p %X %d", q, pri, 0);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002577 return (0);
2578 }
2579 }
2580 mutex_exit(QLOCK(q));
2581 TRACE_3(TR_FAC_STREAMS_FR, TR_BCANPUT_OUT,
brutus17169042008-05-23 20:14:10 -07002582 "bcanput:%p %X %d", q, pri, 1);
stevel@tonic-gate7c478bd2005-06-14 00:00:00 -07002583 return (1);
2584}
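
/*
 * Illustrative sketch, not part of the original file: a band-aware write
 * put procedure defers a message with putq() when its priority band is
 * flow controlled.  bcanputnext(q, band) is the MT-safe equivalent of
 * bcanput(q->q_next, band); example_wput_banded and its pairing with a
 * service procedure like the one sketched after getq() are hypothetical.
 */
static int
example_wput_banded(queue_t *q, mblk_t *mp)
{
	if (DB_TYPE(mp) >= QPCTL || bcanputnext(q, mp->b_band)) {
		putnext(q, mp);		/* hi-pri msgs bypass flow control */
	} else {
		(void) putq(q, mp);	/* defer; the service procedure drains later */
	}
	return (0);
}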
2585
2586/*
2587 * Put a message on a queue.
2588 *
2589 * Messages are enqueued on a priority basis. The priority classes
2590 * are HIGH PRIORITY (type >= QPCTL), PRIORITY (type < QPCTL && band > 0),
2591 * and B_NORMAL (type < QPCTL && band == 0).
2592 *
2593 * Add appropriate weighted data block sizes to queue count.
2594 * If queue hits high water mark then set QFULL flag.
2595 *
2596 * If QNOENAB is not set (putq is allowed to enable the queue),
2597 * enable the queue only if the message is PRIORITY,
2598 * or the QWANTR flag is set (indicating that the service procedure
 2599 * is ready to read the queue).  This implies that a service
2600 * procedure must NEVER put a high priority message back on its own
2601 * queue, as this would result in an infinite loop (!).
2602 */
2603int
2604putq(queue_t *q, mblk_t *bp)
2605{
2606 mblk_t *tmp;
2607 qband_t *qbp = NULL;
2608 int mcls = (int)queclass(bp);
2609 kthread_id_t freezer;
2610 int bytecnt = 0, mblkcnt = 0;
2611
2612 freezer = STREAM(q)->sd_freezer;
2613 if (freezer == curthread) {
2614 ASSERT(frozenstr(q));
2615 ASSERT(MUTEX_HELD(QLOCK(q)));
2616 } else
2617 mutex_enter(QLOCK(q));
2618
2619 /*
2620 * Make sanity checks and if qband structure is not yet
2621 * allocated, do so.
2622 */
2623 if (mcls == QPCTL) {
2624 if (bp->b_band != 0)
2625 bp->b_band = 0; /* force to be correct */
2626 } else if (bp->b_band != 0) {
2627 int i;
2628 qband_t **qbpp;
2629
2630 if (bp->b_band > q->q_nband) {
2631
2632 /*
2633 * The qband structure for this priority band is
2634 * not on the queue yet, so we have to allocate
2635 * one on the fly. It would be wasteful to
2636 * associate the qband structures with every
2637 * queue when the queues are allocated. This is
2638 * because most queues will only need the normal
2639 * band of flow which can be described entirely
2640 * by the queue itself.
2641 */
2642 qbpp = &q->q_bandp;
2643 while (*qbpp)
2644 qbpp = &(*qbpp)->qb_next;
2645 while (bp->b_band > q->q_nband) {
2646 if ((*qbpp = allocband()) == NULL) {
2647 if (freezer != curthread)
2648 mutex_exit(QLOCK(q));
2649 return (0);
2650 }
2651 (*qbpp)->qb_hiwat = q->q_hiwat;
2652 (*qbpp)->qb_lowat = q->q_lowat;
2653 q->q_nband++;
2654 qbpp = &(*qbpp)->qb_next;
2655