/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/cmn_err.h>

#include <sys/stropts.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>
#include <sys/tihdr.h>

#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/sodirect.h>
#include <sys/ddi.h>
#include <inet/ip.h>
#include <sys/time.h>
#include <sys/cmn_err.h>

#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

#define	MBLK_PULL_LEN 64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

#ifdef DEBUG
boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif

static int
so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	struct sonode *nso = NULL;

	*nsop = NULL;
	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
	while ((nso = list_remove_head(&so->so_acceptq_list)) == NULL) {
		/*
		 * No need to check so_error here, because it is not
		 * possible for a listening socket to be reset or otherwise
		 * disconnected.
		 *
		 * So now we just need to check if it's ok to wait.
		 */
		if (dontblock)
			return (EWOULDBLOCK);
		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (cv_wait_sig_swap(&so->so_acceptq_cv,
		    &so->so_acceptq_lock) == 0)
			return (EINTR);
	}

	ASSERT(nso != NULL);
	ASSERT(so->so_acceptq_len > 0);
	so->so_acceptq_len--;
	nso->so_listener = NULL;

	*nsop = nso;

	return (0);
}

/*
 * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 *
 * Pulls a connection off of the accept queue.
 *
 * Arguments:
 *   so        - listening socket
 *   dontblock - indicate whether it's ok to sleep if there are no
 *               connections on the queue
 *   nsop      - Value-return argument
 *
 * Return values:
 *   0 when a connection is successfully dequeued, in which case nsop
 *   is set to point to the new connection. Upon failure a non-zero
 *   value is returned, and the value of nsop is set to NULL.
 *
 * Note:
 *   so_acceptq_dequeue() may return prematurely if the socket is falling
 *   back to TPI.
 */
int
so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	int error;

	mutex_enter(&so->so_acceptq_lock);
	error = so_acceptq_dequeue_locked(so, dontblock, nsop);
	mutex_exit(&so->so_acceptq_lock);

	return (error);
}

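/*
 * Illustrative sketch (not part of the original file): how an accept
 * path might use so_acceptq_dequeue(). The function name and the
 * SOCK_EXAMPLES guard are hypothetical; the real consumer is the
 * generic accept code in sockfs.
 */
#ifdef SOCK_EXAMPLES
static int
example_accept_one(struct sonode *listener, boolean_t nonblocking,
    struct sonode **nsop)
{
	int error;

	/* Returns EWOULDBLOCK instead of sleeping when nonblocking is set */
	error = so_acceptq_dequeue(listener, nonblocking, nsop);
	if (error == 0) {
		/* The new sonode has been unlinked from the listener */
		ASSERT(*nsop != NULL && (*nsop)->so_listener == NULL);
	}
	return (error);
}
#endif /* SOCK_EXAMPLES */
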
static void
so_acceptq_flush_impl(struct sonode *so, list_t *list, boolean_t doclose)
{
	struct sonode *nso;

	while ((nso = list_remove_head(list)) != NULL) {
		nso->so_listener = NULL;
		if (doclose) {
			(void) socket_close(nso, 0, CRED());
		} else {
			/*
			 * Only used for fallback - not possible when filters
			 * are present.
			 */
			ASSERT(so->so_filter_active == 0);
			/*
			 * Since the socket is on the accept queue, there can
			 * only be one reference. We drop the reference and
			 * just blow off the socket.
			 */
			ASSERT(nso->so_count == 1);
			nso->so_count--;
			/* drop the proto ref */
			VN_RELE(SOTOV(nso));
		}
		socket_destroy(nso);
	}
}
/*
 * void so_acceptq_flush(struct sonode *so, boolean_t doclose)
 *
 * Removes all pending connections from a listening socket, and
 * frees the associated resources.
 *
 * Arguments:
 *   so      - listening socket
 *   doclose - make a close downcall for each socket on the accept queue
 *
 * Return values:
 *   None.
 *
 * Note:
 *   The caller has to ensure that no calls to so_acceptq_enqueue() or
 *   so_acceptq_dequeue() occur while the accept queue is being flushed.
 *   So either the socket needs to be in a state where no operations
 *   would come in, or so_lock needs to be obtained.
 */
void
so_acceptq_flush(struct sonode *so, boolean_t doclose)
{
	so_acceptq_flush_impl(so, &so->so_acceptq_list, doclose);
	so_acceptq_flush_impl(so, &so->so_acceptq_defer, doclose);

	so->so_acceptq_len = 0;
}

int
so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
    sock_connid_t id)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	/*
	 * The protocol has notified us that a connection attempt is being
	 * made, so before we wait for a notification to arrive we must
	 * clear out any errors associated with earlier connection attempts.
	 */
	if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
		so->so_error = 0;

	while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
		if (nonblock)
			return (EINPROGRESS);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
			return (EINTR);
	}

	if (so->so_error != 0)
		return (sogeterr(so, B_TRUE));
	/*
	 * Under normal circumstances, so_error should contain an error
	 * in case the connect failed. However, it is possible for another
	 * thread to come in and consume the error, so generate a sensible
	 * error in that case.
	 */
	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ECONNREFUSED);

	return (0);
}

/*
 * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 *     sock_connid_t id)
 *
 * Wait until the socket is connected or an error has occurred.
 *
 * Arguments:
 *   so       - socket
 *   nonblock - indicate whether it's ok to sleep if the connection has
 *              not yet been established
 *   id       - generation number that was returned by the protocol
 *              when the operation was started
 *
 * Returns:
 *   0 if the connection attempt was successful, or an error indicating why
 *   the connection attempt failed.
 */
int
so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
{
	int error;

	mutex_enter(&so->so_lock);
	error = so_wait_connected_locked(so, nonblock, id);
	mutex_exit(&so->so_lock);

	return (error);
}

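/*
 * Illustrative sketch (not part of the original file): the typical
 * connect-side pattern, where the connection id returned by the
 * protocol's connect downcall is handed to so_wait_connected(). The
 * function name and the SOCK_EXAMPLES guard are hypothetical.
 */
#ifdef SOCK_EXAMPLES
static int
example_connect_wait(struct sonode *so, sock_connid_t id, int fmode)
{
	boolean_t nonblock = (fmode & (FNDELAY|FNONBLOCK)) != 0;

	/* EINPROGRESS is returned for a nonblocking, in-flight connect */
	return (so_wait_connected(so, nonblock, id));
}
#endif /* SOCK_EXAMPLES */
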
int
so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	while (SO_SND_FLOWCTRLD(so)) {
		if (so->so_state & SS_CANTSENDMORE)
			return (EPIPE);
		if (dontblock)
			return (EWOULDBLOCK);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (so->so_sndtimeo == 0) {
			/*
			 * Zero means disable timeout.
			 */
			error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
		} else {
			error = cv_reltimedwait_sig(&so->so_snd_cv,
			    &so->so_lock, so->so_sndtimeo, TR_CLOCK_TICK);
		}
		if (error == 0)
			return (EINTR);
		else if (error == -1)
			return (EAGAIN);
	}
	return (0);
}

/*
 * int so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 *
 * Wait for the transport to notify us about send buffers becoming
 * available.
 */
int
so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	so->so_snd_wakeup = B_TRUE;
	error = so_snd_wait_qnotfull_locked(so, dontblock);
	so->so_snd_wakeup = B_FALSE;
	mutex_exit(&so->so_lock);

	return (error);
}

void
so_snd_qfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_TRUE;
	mutex_exit(&so->so_lock);
}

void
so_snd_qnotfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_FALSE;
	/* wake up everyone waiting for buffers */
	cv_broadcast(&so->so_snd_cv);
	mutex_exit(&so->so_lock);
}

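/*
 * Illustrative sketch (not part of the original file): the intended
 * pairing of the two calls above. A protocol asserts flow control with
 * so_snd_qfull() when its transmit queue fills, and releases it with
 * so_snd_qnotfull() once space frees up, which wakes the senders
 * sleeping in so_snd_wait_qnotfull(). The SOCK_EXAMPLES guard is
 * hypothetical.
 */
#ifdef SOCK_EXAMPLES
static void
example_proto_tx_state_change(struct sonode *so, boolean_t queue_full)
{
	if (queue_full)
		so_snd_qfull(so);	/* senders will block */
	else
		so_snd_qnotfull(so);	/* cv_broadcast wakes senders */
}
#endif /* SOCK_EXAMPLES */
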
/*
 * Change the process/process group to which SIGIO is sent.
 */
int
socket_chgpgrp(struct sonode *so, pid_t pid)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	if (pid != 0) {
		/*
		 * Permissions check by sending signal 0.
		 * Note that when kill fails it does a
		 * set_errno causing the system call to fail.
		 */
		error = kill(pid, 0);
		if (error != 0) {
			return (error);
		}
	}
	so->so_pgrp = pid;
	return (0);
}


/*
 * Generate a SIGIO: for 'writable' events include a siginfo structure,
 * for read events just send the signal.
 */
/*ARGSUSED*/
static void
socket_sigproc(proc_t *proc, int event)
{
	k_siginfo_t info;

	ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));

	if (event & SOCKETSIG_WRITE) {
		info.si_signo = SIGPOLL;
		info.si_code = POLL_OUT;
		info.si_errno = 0;
		info.si_fd = 0;
		info.si_band = 0;
		sigaddq(proc, NULL, &info, KM_NOSLEEP);
	}
	if (event & SOCKETSIG_READ) {
		sigtoproc(proc, NULL, SIGPOLL);
	}
	if (event & SOCKETSIG_URG) {
		sigtoproc(proc, NULL, SIGURG);
	}
}

void
socket_sendsig(struct sonode *so, int event)
{
	proc_t *proc;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
	    event != SOCKETSIG_URG)) {
		return;
	}

	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));

	if (so->so_pgrp > 0) {
		/*
		 * XXX This unfortunately still generates
		 * a signal when a fd is closed but
		 * the proc is active.
		 */
		mutex_enter(&pidlock);
		proc = prfind(so->so_pgrp);
		if (proc == NULL) {
			mutex_exit(&pidlock);
			return;
		}
		mutex_enter(&proc->p_lock);
		mutex_exit(&pidlock);
		socket_sigproc(proc, event);
		mutex_exit(&proc->p_lock);
	} else {
		/*
		 * Send to process group. Hold pidlock across
		 * calls to socket_sigproc().
		 */
		pid_t pgrp = -so->so_pgrp;

		mutex_enter(&pidlock);
		proc = pgfind(pgrp);
		while (proc != NULL) {
			mutex_enter(&proc->p_lock);
			socket_sigproc(proc, event);
			mutex_exit(&proc->p_lock);
			proc = proc->p_pglink;
		}
		mutex_exit(&pidlock);
	}
}

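/*
 * Illustrative sketch (not part of the original file): how a state
 * change might trigger SIGIO delivery. socket_sendsig() must be called
 * with so_lock held, and it only signals when a process or group has
 * been registered (FIOSETOWN/SIOCSPGRP) and SS_ASYNC is set, except
 * for urgent data. The SOCK_EXAMPLES guard is hypothetical.
 */
#ifdef SOCK_EXAMPLES
static void
example_notify_readable(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	socket_sendsig(so, SOCKETSIG_READ);
	mutex_exit(&so->so_lock);
}
#endif /* SOCK_EXAMPLES */
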
#define	MIN(a, b) ((a) < (b) ? (a) : (b))
/* Copy userdata into a new mblk_t */
mblk_t *
socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
    size_t tail_len, int *errorp)
{
	mblk_t *head = NULL, **tail = &head;

	ASSERT(iosize == INFPSZ || iosize > 0);

	if (iosize == INFPSZ || iosize > uiop->uio_resid)
		iosize = uiop->uio_resid;

	if (maxblk == INFPSZ)
		maxblk = iosize;

	/* Nothing to do in these cases, so we're done */
	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
		goto done;

	/*
	 * We will enter the loop below if iosize is 0; it will allocate an
	 * empty message block and call uiomove(9F) which will just return.
	 * We could avoid that with an extra check but would only slow
	 * down the much more likely case where iosize is larger than 0.
	 */
	do {
		ssize_t blocksize;
		mblk_t *mp;

		blocksize = MIN(iosize, maxblk);
		ASSERT(blocksize >= 0);
		mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
		if (mp == NULL) {
			*errorp = ENOMEM;
			return (head);
		}
		mp->b_rptr += wroff;
		mp->b_wptr = mp->b_rptr + blocksize;

		*tail = mp;
		tail = &mp->b_cont;

		/* uiomove(9F) either returns 0 or EFAULT */
		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
		    UIO_WRITE, uiop)) != 0) {
			ASSERT(*errorp != ENOMEM);
			freemsg(head);
			return (NULL);
		}

		iosize -= blocksize;
	} while (iosize > 0);

done:
	*errorp = 0;
	return (head);
}

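/*
 * Illustrative sketch (not part of the original file): a send path
 * using socopyinuio() to turn user data described by a uio into an
 * mblk chain, honoring the write offset, maximum block size, and tail
 * length the protocol advertised in so_proto_props. The function name
 * and the SOCK_EXAMPLES guard are hypothetical; freeing a partial
 * chain on ENOMEM is one possible policy, not the only one.
 */
#ifdef SOCK_EXAMPLES
static mblk_t *
example_build_send_chain(struct sonode *so, uio_t *uiop, int *errorp)
{
	mblk_t *mp;

	mp = socopyinuio(uiop, so->so_proto_props.sopp_maxpsz,
	    so->so_proto_props.sopp_wroff, so->so_proto_props.sopp_maxblk,
	    so->so_proto_props.sopp_tail, errorp);
	if (*errorp != 0 && mp != NULL) {
		/* a partial chain was allocated before ENOMEM was hit */
		freemsg(mp);
		mp = NULL;
	}
	return (mp);
}
#endif /* SOCK_EXAMPLES */
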
mblk_t *
socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
{
	int error;
	ptrdiff_t n;
	mblk_t *nmp;

	ASSERT(mp->b_wptr >= mp->b_rptr);

	/*
	 * max_read is the offset of the oobmark and the read cannot go
	 * past the oobmark.
	 */
	if (max_read == INFPSZ || max_read > uiop->uio_resid)
		max_read = uiop->uio_resid;

	do {
		if ((n = MIN(max_read, MBLKL(mp))) != 0) {
			ASSERT(n > 0);

			error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
			if (error != 0) {
				freemsg(mp);
				*errorp = error;
				return (NULL);
			}
		}

		mp->b_rptr += n;
		max_read -= n;
		while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
			/*
			 * get rid of zero length mblks
			 */
			nmp = mp;
			mp = mp->b_cont;
			freeb(nmp);
		}
	} while (mp != NULL && max_read > 0);

	*errorp = 0;
	return (mp);
}

static void
so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
{
	ASSERT(last_tail != NULL);
	mp->b_next = so->so_rcv_q_head;
	mp->b_prev = last_tail;
	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));

	if (so->so_rcv_q_head == NULL) {
		ASSERT(so->so_rcv_q_last_head == NULL);
		so->so_rcv_q_last_head = mp;
#ifdef DEBUG
	} else {
		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
#endif
	}
	so->so_rcv_q_head = mp;

#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
}

/*
 * Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
 * can be processed by so_dequeue_msg().
 */
void
so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
{
	if (so->so_filter_active > 0 &&
	    (mp_head = sof_filter_data_in_proc(so, mp_head,
	    &mp_last_head)) == NULL)
		return;

	ASSERT(mp_head->b_prev != NULL);
	if (so->so_rcv_q_head == NULL) {
		so->so_rcv_q_head = mp_head;
		so->so_rcv_q_last_head = mp_last_head;
		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
	} else {
		boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
		    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));

		if (mp_head->b_next == NULL &&
		    DB_TYPE(mp_head) == M_DATA &&
		    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;
		} else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
			/*
			 * Append to last_head if more than one mblks, and both
			 * mp_head and last_head are I/OAT mblks.
			 */
			ASSERT(mp_head->b_next != NULL);
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;

			so->so_rcv_q_last_head->b_next = mp_head->b_next;
			mp_head->b_next = NULL;
			so->so_rcv_q_last_head = mp_last_head;
		} else {
#ifdef DEBUG
			{
				mblk_t *tmp_mblk;
				tmp_mblk = mp_head;
				while (tmp_mblk != NULL) {
					ASSERT(tmp_mblk->b_prev != NULL);
					tmp_mblk = tmp_mblk->b_next;
				}
			}
#endif
			so->so_rcv_q_last_head->b_next = mp_head;
			so->so_rcv_q_last_head = mp_last_head;
		}
	}
}

/*
 * Check flow control on a given sonode. Must have so_lock held, and
 * this function will release the hold.
 */
void
so_check_flow_control(struct sonode *so)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_flowctrld && (so->so_rcv_queued < so->so_rcvlowat &&
	    !(so->so_state & SS_FIL_RCV_FLOWCTRL))) {
		so->so_flowctrld = B_FALSE;
		mutex_exit(&so->so_lock);
		/*
		 * Open up flow control. SCTP does not have any downcalls, and
		 * it will clr flow ctrl in sosctp_recvmsg().
		 */
		if (so->so_downcalls != NULL &&
		    so->so_downcalls->sd_clr_flowctrl != NULL) {
			(*so->so_downcalls->sd_clr_flowctrl)
			    (so->so_proto_handle);
		}
		/* filters can start injecting data */
		sof_sonode_notify_filters(so, SOF_EV_INJECT_DATA_IN_OK, 0);
	} else {
		mutex_exit(&so->so_lock);
	}
}

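/*
 * Illustrative sketch (not part of the original file): the calling
 * convention for so_check_flow_control(). The caller enters so_lock,
 * adjusts so_rcv_queued, and lets so_check_flow_control() decide
 * whether to lift flow control; the lock is always dropped on return.
 * The function name and the SOCK_EXAMPLES guard are hypothetical.
 */
#ifdef SOCK_EXAMPLES
static void
example_consume_rcv_bytes(struct sonode *so, size_t nbytes)
{
	mutex_enter(&so->so_lock);
	so->so_rcv_queued -= nbytes;
	so_check_flow_control(so);	/* drops so->so_lock */
	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
}
#endif /* SOCK_EXAMPLES */
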
int
so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
    rval_t *rvalp, int flags)
{
	mblk_t *mp, *nmp;
	mblk_t *savemp, *savemptail;
	mblk_t *new_msg_head;
	mblk_t *new_msg_last_head;
	mblk_t *last_tail;
	boolean_t partial_read;
	boolean_t reset_atmark = B_FALSE;
	int more = 0;
	int error;
	ssize_t oobmark;
	sodirect_t *sodp = so->so_direct;

	partial_read = B_FALSE;
	*mctlp = NULL;
again:
	mutex_enter(&so->so_lock);
again1:
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	if (so->so_state & SS_RCVATMARK) {
		/* Check whether the caller is OK to read past the mark */
		if (flags & MSG_NOMARK) {
			mutex_exit(&so->so_lock);
			return (EWOULDBLOCK);
		}
		reset_atmark = B_TRUE;
	}
	/*
	 * First move messages from the dump area to processing area
	 */
	if (sodp != NULL) {
		if (sodp->sod_enabled) {
			if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
				/* nothing to uioamove */
				sodp = NULL;
			} else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
				sodp->sod_uioa.uioa_state &= UIOA_CLR;
				sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
				/*
				 * try to uioamove() the data that
				 * has already been queued.
				 */
				sod_uioa_so_init(so, sodp, uiop);
			}
		} else {
			sodp = NULL;
		}
	}
	new_msg_head = so->so_rcv_head;
	new_msg_last_head = so->so_rcv_last_head;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	oobmark = so->so_oobmark;
	/*
	 * We can release the lock as there can only be one reader
	 */
	mutex_exit(&so->so_lock);

	if (new_msg_head != NULL) {
		so_process_new_message(so, new_msg_head, new_msg_last_head);
	}
	savemp = savemptail = NULL;
	rvalp->r_val1 = 0;
	error = 0;
	mp = so->so_rcv_q_head;

	if (mp != NULL &&
	    (so->so_rcv_timer_tid == 0 ||
	    so->so_rcv_queued >= so->so_rcv_thresh)) {
		partial_read = B_FALSE;

		if (flags & MSG_PEEK) {
			if ((nmp = dupmsg(mp)) == NULL &&
			    (nmp = copymsg(mp)) == NULL) {
				size_t size = msgsize(mp);

				error = strwaitbuf(size, BPRI_HI);
				if (error) {
					return (error);
				}
				goto again;
			}
			mp = nmp;
		} else {
			ASSERT(mp->b_prev != NULL);
			last_tail = mp->b_prev;
			mp->b_prev = NULL;
			so->so_rcv_q_head = mp->b_next;
			if (so->so_rcv_q_head == NULL) {
				so->so_rcv_q_last_head = NULL;
			}
			mp->b_next = NULL;
		}

		ASSERT(mctlp != NULL);
		/*
		 * First process PROTO or PCPROTO blocks, if any.
		 */
		if (DB_TYPE(mp) != M_DATA) {
			*mctlp = mp;
			savemp = mp;
			savemptail = mp;
			ASSERT(DB_TYPE(mp) == M_PROTO ||
			    DB_TYPE(mp) == M_PCPROTO);
			while (mp->b_cont != NULL &&
			    DB_TYPE(mp->b_cont) != M_DATA) {
				ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
				    DB_TYPE(mp->b_cont) == M_PCPROTO);
				mp = mp->b_cont;
				savemptail = mp;
			}
			mp = savemptail->b_cont;
			savemptail->b_cont = NULL;
		}

		ASSERT(DB_TYPE(mp) == M_DATA);
		/*
		 * Now process DATA blocks, if any. Note that for sodirect
		 * enabled socket, uio_resid can be 0.
		 */
		if (uiop->uio_resid >= 0) {
			ssize_t copied = 0;

			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
				mutex_enter(&so->so_lock);
				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
				copied = sod_uioa_mblk(so, mp);
				if (copied > 0)
					partial_read = B_TRUE;
				mutex_exit(&so->so_lock);
				/* mark this mblk as processed */
				mp = NULL;
			} else {
				ssize_t oldresid = uiop->uio_resid;

				if (MBLKL(mp) < so_mblk_pull_len) {
					if (pullupmsg(mp, -1) == 1) {
						last_tail = mp;
					}
				}
				/*
				 * Cannot read beyond the oobmark
				 */
				mp = socopyoutuio(mp, uiop,
				    oobmark == 0 ? INFPSZ : oobmark, &error);
				if (error != 0) {
					freemsg(*mctlp);
					*mctlp = NULL;
					more = 0;
					goto done;
				}
				ASSERT(oldresid >= uiop->uio_resid);
				copied = oldresid - uiop->uio_resid;
				if (oldresid > uiop->uio_resid)
					partial_read = B_TRUE;
			}
			ASSERT(copied >= 0);
			if (copied > 0 && !(flags & MSG_PEEK)) {
				mutex_enter(&so->so_lock);
				so->so_rcv_queued -= copied;
				ASSERT(so->so_oobmark >= 0);
				if (so->so_oobmark > 0) {
					so->so_oobmark -= copied;
					ASSERT(so->so_oobmark >= 0);
					if (so->so_oobmark == 0) {
						ASSERT(so->so_state &
						    SS_OOBPEND);
						so->so_oobmark = 0;
						so->so_state |= SS_RCVATMARK;
					}
				}
				/*
				 * so_check_flow_control() will drop
				 * so->so_lock.
				 */
				so_check_flow_control(so);
			}
		}
		if (mp != NULL) { /* more data blocks in msg */
			more |= MOREDATA;
			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
				if (flags & MSG_PEEK) {
					freemsg(mp);
				} else {
					unsigned int msize = msgdsize(mp);

					freemsg(mp);
					mutex_enter(&so->so_lock);
					so->so_rcv_queued -= msize;
					/*
					 * so_check_flow_control() will drop
					 * so->so_lock.
					 */
					so_check_flow_control(so);
				}
			} else if (partial_read && !somsghasdata(mp)) {
				/*
				 * Avoid queuing a zero-length tail part of
				 * a message. partial_read == 1 indicates that
				 * we read some of the message.
				 */
				freemsg(mp);
				more &= ~MOREDATA;
			} else {
				if (savemp != NULL &&
				    (flags & MSG_DUPCTRL)) {
					mblk_t *nmp;
					/*
					 * There should only be non data mblks
					 */
					ASSERT(DB_TYPE(savemp) != M_DATA &&
					    DB_TYPE(savemptail) != M_DATA);
try_again:
					if ((nmp = dupmsg(savemp)) == NULL &&
					    (nmp = copymsg(savemp)) == NULL) {

						size_t size = msgsize(savemp);

						error = strwaitbuf(size,
						    BPRI_HI);
						if (error != 0) {
							/*
							 * In case we
							 * cannot copy
							 * control data
							 * free the remaining
							 * data.
							 */
							freemsg(mp);
							goto done;
						}
						goto try_again;
					}

					ASSERT(nmp != NULL);
					ASSERT(DB_TYPE(nmp) != M_DATA);
					savemptail->b_cont = mp;
					*mctlp = nmp;
					mp = savemp;
				}
				/*
				 * putback mp
				 */
				so_prepend_msg(so, mp, last_tail);
			}
		}

		/* fast check so_rcv_head if there is more data */
		if (partial_read && !(so->so_state & SS_RCVATMARK) &&
		    *mctlp == NULL && uiop->uio_resid > 0 &&
		    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
			goto again;
		}
	} else if (!partial_read) {
		mutex_enter(&so->so_lock);
		if (so->so_error != 0) {
			error = sogeterr(so, !(flags & MSG_PEEK));
			mutex_exit(&so->so_lock);
			return (error);
		}
		/*
		 * No pending data. Return right away for nonblocking
		 * socket, otherwise sleep waiting for data.
		 */
		if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
			    (flags & MSG_DONTWAIT)) {
				error = EWOULDBLOCK;
			} else {
				if (so->so_state & (SS_CLOSING |
				    SS_FALLBACK_PENDING)) {
					mutex_exit(&so->so_lock);
					error = EINTR;
					goto done;
				}

				if (so->so_rcv_head != NULL) {
					goto again1;
				}
				so->so_rcv_wakeup = B_TRUE;
				so->so_rcv_wanted = uiop->uio_resid;
				if (so->so_rcvtimeo == 0) {
					/*
					 * Zero means disable timeout.
					 */
					error = cv_wait_sig(&so->so_rcv_cv,
					    &so->so_lock);
				} else {
					error = cv_reltimedwait_sig(
					    &so->so_rcv_cv, &so->so_lock,
					    so->so_rcvtimeo, TR_CLOCK_TICK);
				}
				so->so_rcv_wakeup = B_FALSE;
				so->so_rcv_wanted = 0;

				if (error == 0) {
					error = EINTR;
				} else if (error == -1) {
					error = EAGAIN;
				} else {
					goto again1;
				}
			}
		}
		mutex_exit(&so->so_lock);
	}
	if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
		/*
		 * We are past the mark; update the state.
		 * 4.3BSD and 4.4BSD clear the mark when peeking across it.
		 * The draft POSIX socket spec states that the mark should
		 * not be cleared when peeking. We follow the latter.
		 */
		mutex_enter(&so->so_lock);
		ASSERT(so_verify_oobstate(so));
		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		ASSERT(so_verify_oobstate(so));
		mutex_exit(&so->so_lock);
	}
	ASSERT(so->so_rcv_wakeup == B_FALSE);
done:
	if (sodp != NULL) {
		mutex_enter(&so->so_lock);
		if (sodp->sod_enabled &&
		    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
			SOD_UIOAFINI(sodp);
			if (sodp->sod_uioa.uioa_mbytes > 0) {
				ASSERT(so->so_rcv_q_head != NULL ||
				    so->so_rcv_head != NULL);
				so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
		mutex_exit(&so->so_lock);
	}
#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
	rvalp->r_val1 = more;
	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	return (error);
}

/*
 * Enqueue data from the protocol on the socket's rcv queue.
 *
 * We try to hook new M_DATA mblks onto an existing chain, however,
 * that cannot be done if the existing chain has already been
 * processed by I/OAT. Non-M_DATA mblks are just linked together via
 * b_next. In all cases the b_prev of the enqueued mblk is set to
 * point to the last mblk in its b_cont chain.
 */
void
so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	so->so_rcv_queued += msg_size;

	if (so->so_rcv_head == NULL) {
		ASSERT(so->so_rcv_last_head == NULL);
		so->so_rcv_head = mp;
		so->so_rcv_last_head = mp;
	} else if ((DB_TYPE(mp) == M_DATA &&
	    DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
	    ((DB_FLAGS(mp) & DBLK_UIOA) ==
	    (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
		/* Added to the end */
		ASSERT(so->so_rcv_last_head != NULL);
		ASSERT(so->so_rcv_last_head->b_prev != NULL);
		so->so_rcv_last_head->b_prev->b_cont = mp;
	} else {
		/* Start a new end */
		so->so_rcv_last_head->b_next = mp;
		so->so_rcv_last_head = mp;
	}
	while (mp->b_cont != NULL)
		mp = mp->b_cont;

	so->so_rcv_last_head->b_prev = mp;
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
}

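/*
 * Illustrative sketch (not part of the original file): a protocol
 * delivery path enqueuing data and waking a sleeping reader, mirroring
 * how so_enqueue_msg() pairs with the so_rcv_cv/so_rcv_wakeup protocol
 * that so_dequeue_msg() uses when it sleeps. The function name and the
 * SOCK_EXAMPLES guard are hypothetical; the real upcall path goes
 * through the socket notification routines.
 */
#ifdef SOCK_EXAMPLES
static void
example_proto_deliver(struct sonode *so, mblk_t *mp)
{
	size_t msg_size = msgdsize(mp);

	mutex_enter(&so->so_lock);
	so_enqueue_msg(so, mp, msg_size);
	if (so->so_rcv_wakeup)
		cv_signal(&so->so_rcv_cv);	/* reader in so_dequeue_msg() */
	mutex_exit(&so->so_lock);
}
#endif /* SOCK_EXAMPLES */
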
/*
 * Return B_TRUE if there is data in the message, B_FALSE otherwise.
 */
boolean_t
somsghasdata(mblk_t *mp)
{
	for (; mp; mp = mp->b_cont)
		if (mp->b_datap->db_type == M_DATA) {
			ASSERT(mp->b_wptr >= mp->b_rptr);
			if (mp->b_wptr > mp->b_rptr)
				return (B_TRUE);
		}
	return (B_FALSE);
}

/*
 * Flush the read side of sockfs.
 *
 * The caller must be sure that a reader is not already active when the
 * buffer is being flushed.
 */
void
so_rcv_flush(struct sonode *so)
{
	mblk_t *mp;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_oobmsg != NULL) {
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		so->so_oobmark = 0;
		so->so_state &=
		    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
	}

	/*
	 * Free messages sitting in the recv queues
	 */
	while (so->so_rcv_q_head != NULL) {
		mp = so->so_rcv_q_head;
		so->so_rcv_q_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	while (so->so_rcv_head != NULL) {
		mp = so->so_rcv_head;
		so->so_rcv_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
}

/*
 * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
 */
int
sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
    boolean_t oob_inline)
{
	mblk_t *mp, *nmp;
	int error;

	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
	    flags));

	if (msg != NULL) {
		/*
		 * There is never any oob data with addresses or control since
		 * the T_EXDATA_IND does not carry any options.
		 */
		msg->msg_controllen = 0;
		msg->msg_namelen = 0;
		msg->msg_flags = 0;
	}

	mutex_enter(&so->so_lock);
	ASSERT(so_verify_oobstate(so));
	if (oob_inline ||
	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
		mutex_exit(&so->so_lock);
		return (EINVAL);
	}
	if (!(so->so_state & SS_HAVEOOBDATA)) {
		dprintso(so, 1, ("sorecvoob: no data yet\n"));
		mutex_exit(&so->so_lock);
		return (EWOULDBLOCK);
	}
	ASSERT(so->so_oobmsg != NULL);
	mp = so->so_oobmsg;
	if (flags & MSG_PEEK) {
		/*
		 * Since recv* cannot return ENOBUFS we cannot use dupmsg.
		 * Instead we revert to the consolidation private
		 * allocb_wait plus bcopy.
		 */
		mblk_t *mp1;

		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
		ASSERT(mp1);

		while (mp != NULL) {
			ssize_t size;

			size = MBLKL(mp);
			bcopy(mp->b_rptr, mp1->b_wptr, size);
			mp1->b_wptr += size;
			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
			mp = mp->b_cont;
		}
		mp = mp1;
	} else {
		/*
		 * Update the state indicating that the data has been consumed.
		 * Keep SS_OOBPEND set until data is consumed past the mark.
		 */
		so->so_oobmsg = NULL;
		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
	}
	ASSERT(so_verify_oobstate(so));
	mutex_exit(&so->so_lock);

	error = 0;
	nmp = mp;
	while (nmp != NULL && uiop->uio_resid > 0) {
		ssize_t n = MBLKL(nmp);

		n = MIN(n, uiop->uio_resid);
		if (n > 0)
			error = uiomove(nmp->b_rptr, n,
			    UIO_READ, uiop);
		if (error)
			break;
		nmp = nmp->b_cont;
	}
	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
	freemsg(mp);
	return (error);
}

/*
 * Allocate and initialize a sonode.
 */
/* ARGSUSED */
struct sonode *
socket_sonode_create(struct sockparams *sp, int family, int type,
    int protocol, int version, int sflags, int *errorp, struct cred *cr)
{
	sonode_t *so;
	int kmflags;

	/*
	 * Choose the right set of sonodeops based on the upcall and
	 * down call version that the protocol has provided
	 */
	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
		/*
		 * mismatch
		 */
#ifdef DEBUG
		cmn_err(CE_CONT, "protocol and socket module version mismatch");
#endif
		*errorp = EINVAL;
		return (NULL);
	}

	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

	so = kmem_cache_alloc(socket_cache, kmflags);
	if (so == NULL) {
		*errorp = ENOMEM;
		return (NULL);
	}

	sonode_init(so, sp, family, type, protocol, &so_sonodeops);

	if (version == SOV_DEFAULT)
		version = so_default_version;

	so->so_version = (short)version;

	/*
	 * Set the default values to be INFPSZ;
	 * if a protocol desires it can change the value later.
	 */
	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
	so->so_proto_props.sopp_maxpsz = INFPSZ;
	so->so_proto_props.sopp_maxblk = INFPSZ;

	return (so);
}

int
socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
{
	int error = 0;

	if (pso != NULL) {
		/*
		 * We have a passive open, so inherit basic state from
		 * the parent (listener).
		 *
		 * No need to grab the new sonode's lock, since there is no
		 * one that can have a reference to it.
		 */
		mutex_enter(&pso->so_lock);

		so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
		so->so_pgrp = pso->so_pgrp;
		so->so_rcvtimeo = pso->so_rcvtimeo;
		so->so_sndtimeo = pso->so_sndtimeo;
		so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
		/*
		 * Make note of the socket level options. TCP and IP level
		 * options are already inherited. We could do all this after
		 * accept is successful but doing it here simplifies code and
		 * no harm done for error case.
		 */
		so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
		    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
		    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
		so->so_proto_props = pso->so_proto_props;
		so->so_mode = pso->so_mode;
		so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;

		mutex_exit(&pso->so_lock);

		/*
		 * If the parent has any filters, try to inherit them.
		 */
		if (pso->so_filter_active > 0 &&
		    (error = sof_sonode_inherit_filters(so, pso)) != 0)
			return (error);

	} else {
		struct sockparams *sp = so->so_sockparams;
		sock_upcalls_t *upcalls_to_use;

		/*
		 * Attach automatic filters, if there are any.
		 */
		if (!list_is_empty(&sp->sp_auto_filters) &&
		    (error = sof_sonode_autoattach_filters(so, cr)) != 0)
			return (error);

		/* OK to attach filters */
		so->so_state |= SS_FILOP_OK;

		/*
		 * Based on the version number select the right upcalls to
		 * pass down. Currently we only have one version, so choose
		 * the default.
		 */
		upcalls_to_use = &so_upcalls;

		/* active open, so create a lower handle */
		so->so_proto_handle =
		    sp->sp_smod_info->smod_proto_create_func(so->so_family,
		    so->so_type, so->so_protocol, &so->so_downcalls,
		    &so->so_mode, &error, flags, cr);

		if (so->so_proto_handle == NULL) {
			ASSERT(error != 0);
			/*
			 * To be safe; if a lower handle cannot be created, and
			 * the proto does not give a reason why, assume there
			 * was a lack of memory.
			 */
			return ((error == 0) ? ENOMEM : error);
		}
		ASSERT(so->so_downcalls != NULL);
		ASSERT(so->so_downcalls->sd_send != NULL ||
		    so->so_downcalls->sd_send_uio != NULL);
		if (so->so_downcalls->sd_recv_uio != NULL) {
			ASSERT(so->so_downcalls->sd_poll != NULL);
			so->so_pollev |= SO_POLLEV_ALWAYS;
		}

		(*so->so_downcalls->sd_activate)(so->so_proto_handle,
		    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);

		/* Wildcard */

		/*
		 * FIXME No need for this, the protocol can deal with it in
		 * sd_create(). Should update ICMP.
		 */
		if (so->so_protocol != so->so_sockparams->sp_protocol) {
			int protocol = so->so_protocol;
			int error;
			/*
			 * Issue SO_PROTOTYPE setsockopt.
			 */
			error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
			if (error) {
				(void) (*so->so_downcalls->sd_close)
				    (so->so_proto_handle, 0, cr);

				mutex_enter(&so->so_lock);
				so_rcv_flush(so);
				mutex_exit(&so->so_lock);
				/*
				 * Setsockopt often fails with ENOPROTOOPT but
				 * socket() should fail with
				 * EPROTONOSUPPORT/EPROTOTYPE.
				 */
				return (EPROTONOSUPPORT);
			}
		}
	}

	if (uioasync.enabled)
		sod_sock_init(so);

	/* put an extra reference on the socket for the protocol */
	VN_HOLD(SOTOV(so));

	return (0);
}

/*
 * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
 *     struct cred *cr, int32_t *rvalp)
 *
 * Handle ioctls that manipulate basic socket state; non-blocking,
 * async, etc.
 *
 * Returns:
 *   < 0  - ioctl was not handled
 *   >= 0 - ioctl was handled, if > 0, then it is an errno
 *
 * Notes:
 *   Assumes the standard receive buffer is used to obtain info for
 *   NREAD.
 */
/* ARGSUSED */
int
socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	switch (cmd) {
	case SIOCSQPTR:
		/*
		 * SIOCSQPTR is valid only when helper stream is created
		 * by the protocol.
		 */

		return (EOPNOTSUPP);
	case FIONBIO: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		if (value) {
			so->so_state |= SS_NDELAY;
		} else {
			so->so_state &= ~SS_NDELAY;
		}
		mutex_exit(&so->so_lock);
		return (0);
	}
	case FIOASYNC: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);

		if (value) {
			/* Turn on SIGIO */
			so->so_state |= SS_ASYNC;
		} else {
			/* Turn off SIGIO */
			so->so_state &= ~SS_ASYNC;
		}
		mutex_exit(&so->so_lock);

		return (0);
	}

	case SIOCSPGRP:
	case FIOSETOWN: {
		int error;
		pid_t pid;

		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
		mutex_exit(&so->so_lock);
		return (error);
	}
	case SIOCGPGRP:
	case FIOGETOWN:
		if (so_copyout(&so->so_pgrp, (void *)arg,
		    sizeof (pid_t), (mode & (int)FKIOCTL)))
			return (EFAULT);

		return (0);
	case SIOCATMARK: {
		int retval;

		/*
		 * Only protocols that support urgent data can handle ATMARK.
		 */
		if ((so->so_mode & SM_EXDATA) == 0)
			return (EINVAL);

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = (so->so_state & SS_RCVATMARK) != 0;

		if (so_copyout(&retval, (void *)arg, sizeof (int),
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case FIONREAD: {
		int retval;

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = MIN(so->so_rcv_queued, INT_MAX);

		if (so_copyout(&retval, (void *)arg,
		    sizeof (retval), (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case _I_GETPEERCRED: {
		int error = 0;

		if ((mode & FKIOCTL) == 0)
			return (EINVAL);

		mutex_enter(&so->so_lock);
		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
			error = ENOTSUP;
		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
			error = ENOTCONN;
		} else if (so->so_peercred != NULL) {
			k_peercred_t *kp = (k_peercred_t *)arg;
			kp->pc_cr = so->so_peercred;
			kp->pc_cpid = so->so_cpid;
			crhold(so->so_peercred);
		} else {
			error = EINVAL;
		}
		mutex_exit(&so->so_lock);
		return (error);
	}
	default:
		return (-1);
	}
}

/*
 * Handle the I_NREAD STREAM ioctl.
 */
static int
so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
{
	size_t size = 0;
	int retval;
	int count = 0;
	mblk_t *mp;
	clock_t wakeup = drv_usectohz(10);

	if (so->so_downcalls == NULL ||
	    so->so_downcalls->sd_recv_uio != NULL)
		return (EINVAL);

	mutex_enter(&so->so_lock);
	/* Wait for reader to get out of the way. */
	while (so->so_flag & SOREADLOCKED) {
		/*
		 * If reader is waiting for data, then there should be nothing
		 * on the rcv queue.
		 */
		if (so->so_rcv_wakeup)
			goto out;

		/* Do a timed sleep, in case the reader goes to sleep. */
		(void) cv_reltimedwait(&so->so_read_cv, &so->so_lock, wakeup,
		    TR_CLOCK_TICK);
	}

	/*
	 * Since we are holding so_lock no new reader will come in, and the
	 * protocol will not be able to enqueue data. So it's safe to walk
	 * both rcv queues.
	 */
	mp = so->so_rcv_q_head;
	if (mp != NULL) {
		size = msgdsize(so->so_rcv_q_head);
		for (; mp != NULL; mp = mp->b_next)
			count++;
	} else {
		/*
		 * In case the processing list was empty, get the size of the
		 * next msg in line.
		 */
		size = msgdsize(so->so_rcv_head);
	}

	for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
		count++;
out:
	mutex_exit(&so->so_lock);

	/*
	 * Drop down from size_t to the "int" required by the
	 * interface. Cap at INT_MAX.
	 */
	retval = MIN(size, INT_MAX);
	if (so_copyout(&retval, (void *)arg, sizeof (retval),
	    (mode & (int)FKIOCTL))) {
		return (EFAULT);
	} else {
		*rvalp = count;
		return (0);
	}
}

/*
 * Process STREAM ioctls.
 *
 * Returns:
 *   < 0  - ioctl was not handled
 *   >= 0 - ioctl was handled, if > 0, then it is an errno
 */
int
socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	int retval;

	/* Only STREAM ioctls are handled here */
	if ((cmd & 0xffffff00U) != STR)
		return (-1);

	switch (cmd) {
	case I_CANPUT:
		/*
		 * We return an error for I_CANPUT so that isastream(3C) will
		 * not report the socket as being a STREAM.
		 */
		return (EOPNOTSUPP);
	case I_NREAD:
		/* Avoid doing a fallback for I_NREAD. */
		return (so_strioc_nread(so, arg, mode, rvalp));
	case I_LOOK:
		/* Avoid doing a fallback for I_LOOK. */
		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	default:
		break;
	}

	/*
	 * Try to fall back to TPI, and if successful, reissue the ioctl.
	 */
	if ((retval = so_tpi_fallback(so, cr)) == 0) {
		/* Reissue the ioctl */
		ASSERT(so->so_rcv_q_head == NULL);
		return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
	} else {
		return (retval);
	}
}

/*
 * This is called for all socket types to verify that the buffer size is large
 * enough for the option, and if we can, handle the request as well. Most
 * options will be forwarded to the protocol.
 */
int
socket_getopt_common(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags)
{
	if (level != SOL_SOCKET)
		return (-1);

	switch (option_name) {
	case SO_ERROR:
	case SO_DOMAIN:
	case SO_TYPE:
	case SO_ACCEPTCONN: {
		int32_t value;
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t)) {
			return (EINVAL);
		}

		switch (option_name) {
		case SO_ERROR:
			mutex_enter(&so->so_lock);
			value = sogeterr(so, B_TRUE);
			mutex_exit(&so->so_lock);
			break;
		case SO_DOMAIN:
			value = so->so_family;
			break;
		case SO_TYPE:
			value = so->so_type;
			break;
		case SO_ACCEPTCONN:
			if (so->so_state & SS_ACCEPTCONN)
				value = SO_ACCEPTCONN;
			else
				value = 0;
			break;
		}

		bcopy(&value, optval, sizeof (value));
		*optlenp = sizeof (value);

		return (0);
	}
	case SO_SNDTIMEO:
	case SO_RCVTIMEO: {
		clock_t value;
		socklen_t optlen = *optlenp;

		if (get_udatamodel() == DATAMODEL_NONE ||
		    get_udatamodel() == DATAMODEL_NATIVE) {
			if (optlen < sizeof (struct timeval))
				return (EINVAL);
		} else {
			if (optlen < sizeof (struct timeval32))
				return (EINVAL);
		}
		if (option_name == SO_RCVTIMEO)
			value = drv_hztousec(so->so_rcvtimeo);
		else
			value = drv_hztousec(so->so_sndtimeo);

		if (get_udatamodel() == DATAMODEL_NONE ||
		    get_udatamodel() == DATAMODEL_NATIVE) {
			((struct timeval *)(optval))->tv_sec =
			    value / (1000 * 1000);
			((struct timeval *)(optval))->tv_usec =
			    value % (1000 * 1000);
			*optlenp = sizeof (struct timeval);
		} else {
			((struct timeval32 *)(optval))->tv_sec =
			    value / (1000 * 1000);
			((struct timeval32 *)(optval))->tv_usec =
			    value % (1000 * 1000);
			*optlenp = sizeof (struct timeval32);
		}
		return (0);
	}
	case SO_DEBUG:
	case SO_REUSEADDR:
	case SO_KEEPALIVE:
	case SO_DONTROUTE:
	case SO_BROADCAST:
	case SO_USELOOPBACK:
	case SO_OOBINLINE:
	case SO_SNDBUF:
#ifdef notyet
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
#endif /* notyet */
	case SO_DGRAM_ERRIND: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);
		break;
	}
	case SO_RCVBUF: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);

		if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
			/*
			 * XXX If SO_RCVBUF has been set and this is an
			 * XPG 4.2 application then do not ask the transport
			 * since the transport might adjust the value and not
			 * return exactly what was set by the application.
			 * For non-XPG 4.2 application we return the value
			 * that the transport is actually using.
			 */
			*(int32_t *)optval = so->so_xpg_rcvbuf;
			*optlenp = sizeof (so->so_xpg_rcvbuf);
			return (0);
		}
		/*
		 * If the option has not been set then get a default
		 * value from the transport.
		 */
		break;
	}
	case SO_LINGER: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct linger))
			return (EINVAL);
		break;
	}
	case SO_SND_BUFINFO: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
			return (EINVAL);
		((struct so_snd_bufinfo *)(optval))->sbi_wroff =
		    (so->so_proto_props).sopp_wroff;
		((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
		    (so->so_proto_props).sopp_maxblk;
		((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
		    (so->so_proto_props).sopp_maxpsz;
		((struct so_snd_bufinfo *)(optval))->sbi_tail =
		    (so->so_proto_props).sopp_tail;
		*optlenp = sizeof (struct so_snd_bufinfo);
		return (0);
	}
	case SO_SND_COPYAVOID: {
		sof_instance_t *inst;

		/*
		 * Avoid zero-copy if there is a filter with a data_out
		 * callback. We could let the operation succeed, but then
		 * the filter would have to copy the data anyway.
		 */
		for (inst = so->so_filter_top; inst != NULL;
		    inst = inst->sofi_next) {
			if (SOF_INTERESTED(inst, data_out))
				return (EOPNOTSUPP);
		}
		break;
	}

	default:
		break;
	}

	/* Unknown Option */
	return (-1);
}

void
socket_sonode_destroy(struct sonode *so)
{
	sonode_fini(so);
	kmem_cache_free(socket_cache, so);
}

int
so_zcopy_wait(struct sonode *so)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	while (!(so->so_copyflag & STZCNOTIFY)) {
		if (so->so_state & SS_CLOSING) {
			mutex_exit(&so->so_lock);
			return (EINTR);
		}
		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
			error = EINTR;
			break;
		}
	}
	so->so_copyflag &= ~STZCNOTIFY;
	mutex_exit(&so->so_lock);
	return (error);
}

void
so_timer_callback(void *arg)
{
	struct sonode *so = (struct sonode *)arg;

	mutex_enter(&so->so_lock);

	so->so_rcv_timer_tid = 0;
	if (so->so_rcv_queued > 0) {
		so_notify_data(so, so->so_rcv_queued);
	} else {
		mutex_exit(&so->so_lock);
	}
}

#ifdef DEBUG
/*
 * Verify that the length stored in so_rcv_queued and the length of data blocks
 * queued is the same.
 */
static boolean_t
so_check_length(sonode_t *so)
{
	mblk_t *mp = so->so_rcv_q_head;
	int len = 0;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (mp != NULL) {
		len = msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	mp = so->so_rcv_head;
	if (mp != NULL) {
		len += msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
}
#endif

int
so_get_mod_version(struct sockparams *sp)
{
	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
	return (sp->sp_smod_info->smod_version);
}

1905/*
1906 * so_start_fallback()
1907 *
1908 * Block new socket operations from coming in, and wait for active operations
1909 * to complete. Threads that are sleeping will be woken up so they can get
1910 * out of the way.
1911 *
1912 * The caller must be a reader on so_fallback_rwlock.
1913 */
1914static boolean_t
1915so_start_fallback(struct sonode *so)
1916{
1917 ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
1918
1919 mutex_enter(&so->so_lock);
1920 if (so->so_state & SS_FALLBACK_PENDING) {
1921 mutex_exit(&so->so_lock);
1922 return (B_FALSE);
1923 }
1924 so->so_state |= SS_FALLBACK_PENDING;
1925 /*
1926 * Poke all threads that might be sleeping. Any operation that comes
1927 * in after the cv_broadcast will observe the fallback pending flag
1928 * which cause the call to return where it would normally sleep.
1929 */
1930 cv_broadcast(&so->so_state_cv); /* threads in connect() */
1931 cv_broadcast(&so->so_rcv_cv); /* threads in recvmsg() */
1932 cv_broadcast(&so->so_snd_cv); /* threads in sendmsg() */
1933 mutex_enter(&so->so_acceptq_lock);
1934 cv_broadcast(&so->so_acceptq_cv); /* threads in accept() */
1935 mutex_exit(&so->so_acceptq_lock);
1936 mutex_exit(&so->so_lock);
1937
1938 /*
1939 * The main reason for the rw_tryupgrade call is to provide
1940 * observability during the fallback process. We want to
1941 * be able to see if there are pending operations.
1942 */
1943 if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
1944 /*
1945 * It is safe to drop and reaquire the fallback lock, because
1946 * we are guaranteed that another fallback cannot take place.
1947 */
1948 rw_exit(&so->so_fallback_rwlock);
1949 DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
1950 rw_enter(&so->so_fallback_rwlock, RW_WRITER);
1951 DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
1952 }
1953
1954 return (B_TRUE);
1955}

/*
 * so_end_fallback()
 *
 * Allow socket operations back in.
 *
 * The caller must be a writer on so_fallback_rwlock.
 */
static void
so_end_fallback(struct sonode *so)
{
	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));

	mutex_enter(&so->so_lock);
	so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
	mutex_exit(&so->so_lock);

	rw_downgrade(&so->so_fallback_rwlock);
}
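
/*
 * so_start_fallback() and so_end_fallback() always bracket the actual
 * conversion work, as so_tpi_fallback() does below (sketch):
 *
 *	if (!so_start_fallback(so))
 *		return (EAGAIN);	(fallback already in progress)
 *	... allocate TPI state, quiesce the protocol, swap so_ops ...
 *	so_end_fallback(so);
 */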

/*
 * so_quiesced_cb()
 *
 * Callback passed to the protocol during fallback. It is called once
 * the endpoint is quiescent.
 *
 * No requests from the user, no notifications from the protocol, so it
 * is safe to synchronize the state. Data can also be moved without
 * risk for reordering.
 *
 * We do not need to hold so_lock, since there can be only one thread
 * operating on the sonode.
 */
static mblk_t *
so_quiesced_cb(sock_upper_handle_t sock_handle, sock_quiesce_arg_t *arg,
    struct T_capability_ack *tcap,
    struct sockaddr *laddr, socklen_t laddrlen,
    struct sockaddr *faddr, socklen_t faddrlen, short opts)
{
	struct sonode *so = (struct sonode *)sock_handle;
	boolean_t atmark;
	mblk_t *retmp = NULL, **tailmpp = &retmp;

	if (tcap != NULL)
		sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen,
		    opts);

	/*
	 * Some protocols do not quiesce the data path during fallback. Once
	 * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
	 * fail and the protocol is responsible for saving the data for later
	 * delivery (i.e., once the fallback has completed).
	 */
	mutex_enter(&so->so_lock);
	so->so_state |= SS_FALLBACK_DRAIN;
	SOCKET_TIMER_CANCEL(so);
	mutex_exit(&so->so_lock);

	if (so->so_rcv_head != NULL) {
		if (so->so_rcv_q_last_head == NULL)
			so->so_rcv_q_head = so->so_rcv_head;
		else
			so->so_rcv_q_last_head->b_next = so->so_rcv_head;
		so->so_rcv_q_last_head = so->so_rcv_last_head;
	}

	atmark = (so->so_state & SS_RCVATMARK) != 0;
	/*
	 * Clear any OOB state having to do with pending data. The TPI
	 * code path will set the appropriate oob state when we move the
	 * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
	 * data has already been consumed.
	 */
	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);

	ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);

	/*
	 * Move data to the STREAM head.
	 */
	while (so->so_rcv_q_head != NULL) {
		mblk_t *mp = so->so_rcv_q_head;
		size_t mlen = msgdsize(mp);

		so->so_rcv_q_head = mp->b_next;
		mp->b_next = NULL;
		mp->b_prev = NULL;

		/*
		 * Send T_EXDATA_IND if we are at the oob mark.
		 */
		if (atmark) {
			struct T_exdata_ind *tei;
			mblk_t *mp1 = arg->soqa_exdata_mp;

			arg->soqa_exdata_mp = NULL;
			ASSERT(mp1 != NULL);
			mp1->b_datap->db_type = M_PROTO;
			tei = (struct T_exdata_ind *)mp1->b_rptr;
			tei->PRIM_type = T_EXDATA_IND;
			tei->MORE_flag = 0;
			mp1->b_wptr = (uchar_t *)&tei[1];

			if (IS_SO_OOB_INLINE(so)) {
				mp1->b_cont = mp;
			} else {
				ASSERT(so->so_oobmsg != NULL);
				mp1->b_cont = so->so_oobmsg;
				so->so_oobmsg = NULL;

				/* process current mp next time around */
				mp->b_next = so->so_rcv_q_head;
				so->so_rcv_q_head = mp;
				mlen = 0;
			}
			mp = mp1;

			/* we have consumed the oob mark */
			atmark = B_FALSE;
		} else if (so->so_oobmark > 0) {
			/*
			 * Check if the OOB mark is within the current
			 * mblk chain. In that case we have to split it up.
			 */
			if (so->so_oobmark < mlen) {
				mblk_t *urg_mp = mp;

				atmark = B_TRUE;
				mp = NULL;
				mlen = so->so_oobmark;

				/*
				 * It is assumed that the OOB mark does
				 * not land within an mblk.
				 */
				do {
					so->so_oobmark -= MBLKL(urg_mp);
					mp = urg_mp;
					urg_mp = urg_mp->b_cont;
				} while (so->so_oobmark > 0);
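				/*
				 * Worked example (hypothetical sizes):
				 * with so_oobmark == 120 and a chain
				 * of three 60-byte mblks, the loop
				 * consumes the first two mblks
				 * (so_oobmark reaches 0), the chain is
				 * cut right at the mark, and the third
				 * mblk is pushed back onto
				 * so_rcv_q_head below.
				 */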
				mp->b_cont = NULL;
				if (urg_mp != NULL) {
					urg_mp->b_next = so->so_rcv_q_head;
					so->so_rcv_q_head = urg_mp;
				}
			} else {
				so->so_oobmark -= mlen;
				if (so->so_oobmark == 0)
					atmark = B_TRUE;
			}
		}

		/*
		 * Queue data on the STREAM head.
		 */
		so->so_rcv_queued -= mlen;
		*tailmpp = mp;
		tailmpp = &mp->b_next;
	}
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;

	/*
	 * Check if the oob byte is at the end of the data stream, or if the
	 * oob byte has not yet arrived. In the latter case we have to send a
	 * SIGURG and a mark indicator to the STREAM head. The mark indicator
	 * is needed to guarantee correct behavior for SIOCATMARK. See block
	 * comment in socktpi.h for more details.
	 */
	if (atmark || so->so_oobmark > 0) {
		mblk_t *mp;

		if (atmark && so->so_oobmsg != NULL) {
			struct T_exdata_ind *tei;

			mp = arg->soqa_exdata_mp;
			arg->soqa_exdata_mp = NULL;
			ASSERT(mp != NULL);
			mp->b_datap->db_type = M_PROTO;
			tei = (struct T_exdata_ind *)mp->b_rptr;
			tei->PRIM_type = T_EXDATA_IND;
			tei->MORE_flag = 0;
			mp->b_wptr = (uchar_t *)&tei[1];

			mp->b_cont = so->so_oobmsg;
			so->so_oobmsg = NULL;

			*tailmpp = mp;
			tailmpp = &mp->b_next;
		} else {
			/* Send up the signal */
			mp = arg->soqa_exdata_mp;
			arg->soqa_exdata_mp = NULL;
			ASSERT(mp != NULL);
			DB_TYPE(mp) = M_PCSIG;
			*mp->b_wptr++ = (uchar_t)SIGURG;
			*tailmpp = mp;
			tailmpp = &mp->b_next;

			/* Send up the mark indicator */
			mp = arg->soqa_urgmark_mp;
			arg->soqa_urgmark_mp = NULL;
			mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
			*tailmpp = mp;
			tailmpp = &mp->b_next;

			so->so_oobmark = 0;
		}
	}
	ASSERT(so->so_oobmark == 0);
	ASSERT(so->so_rcv_queued == 0);

	return (retmp);
}
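
/*
 * Sketch of what the protocol side is assumed to do with the chain
 * returned by so_quiesced_cb(): the callback is invoked from the
 * protocol's fallback routine, which hands the chain to the STREAM
 * head once the fallback has completed, e.g.:
 *
 *	mp = (*quiesced_cb)(handle, &arg, tcap, laddr, laddrlen,
 *	    faddr, faddrlen, opts);
 *	if (mp != NULL)
 *		putnext(q, mp);
 */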

#ifdef DEBUG
/*
 * Do an integrity check of the sonode. This should be done if a
 * fallback fails after the sonode has initially been converted to use
 * TPI and subsequently has to be reverted.
 *
 * Failure to pass the integrity check will panic the system.
 */
void
so_integrity_check(struct sonode *cur, struct sonode *orig)
{
	VERIFY(cur->so_vnode == orig->so_vnode);
	VERIFY(cur->so_ops == orig->so_ops);
	/*
	 * For so_state we can only VERIFY the state flags in CHECK_STATE.
	 * The other state flags might be affected by a notification from
	 * the protocol.
	 */
#define	CHECK_STATE	(SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
	SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
	SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
	VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
	    (orig->so_state & CHECK_STATE));
	VERIFY(cur->so_mode == orig->so_mode);
	VERIFY(cur->so_flag == orig->so_flag);
	VERIFY(cur->so_count == orig->so_count);
	/* Cannot VERIFY so_proto_connid; proto can update it */
	VERIFY(cur->so_sockparams == orig->so_sockparams);
	/* An error might have been recorded, but it cannot be lost */
	VERIFY(cur->so_error != 0 || orig->so_error == 0);
	VERIFY(cur->so_family == orig->so_family);
	VERIFY(cur->so_type == orig->so_type);
	VERIFY(cur->so_protocol == orig->so_protocol);
	VERIFY(cur->so_version == orig->so_version);
	/* New conns might have arrived, but none should have been lost */
	VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
	VERIFY(list_head(&cur->so_acceptq_list) ==
	    list_head(&orig->so_acceptq_list));
	VERIFY(cur->so_backlog == orig->so_backlog);
	/* New OOB might have arrived, but the mark should not be lost */
	VERIFY(cur->so_oobmark >= orig->so_oobmark);
	/* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
	VERIFY(cur->so_pgrp == orig->so_pgrp);
	VERIFY(cur->so_peercred == orig->so_peercred);
	VERIFY(cur->so_cpid == orig->so_cpid);
	VERIFY(cur->so_zoneid == orig->so_zoneid);
	/* New data might have arrived, but none should have been lost */
	VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
	VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
	VERIFY(cur->so_rcv_head == orig->so_rcv_head);
	VERIFY(cur->so_proto_handle == orig->so_proto_handle);
	VERIFY(cur->so_downcalls == orig->so_downcalls);
	/* Cannot VERIFY so_proto_props; they can be updated by proto */
}
#endif
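
/*
 * so_integrity_check() is invoked from the error path at the bottom of
 * so_tpi_fallback() below, comparing the reverted sonode against the
 * copy that was saved before the conversion was attempted.
 */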

/*
 * so_tpi_fallback()
 *
 * This is the fallback initiation routine; things start here.
 *
 * Basic strategy:
 * o Block new socket operations from coming in
 * o Allocate/initiate info needed by TPI
 * o Quiesce the connection, at which point we sync
 *   state and move data
 * o Change operations (sonodeops) associated with the socket
 * o Unblock threads waiting for the fallback to finish
 */
int
so_tpi_fallback(struct sonode *so, struct cred *cr)
{
	int error;
	queue_t *q;
	struct sockparams *sp;
	struct sockparams *newsp = NULL;
	so_proto_fallback_func_t fbfunc;
	const char *devpath;
	boolean_t direct;
	struct sonode *nso;
	sock_quiesce_arg_t arg = { NULL, NULL };
#ifdef DEBUG
	struct sonode origso;
#endif
	error = 0;
	sp = so->so_sockparams;
	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;

	/*
	 * Cannot fall back if the socket has active filters
	 */
	if (so->so_filter_active > 0)
		return (EINVAL);

	switch (so->so_family) {
	case AF_INET:
		devpath = sp->sp_smod_info->smod_fallback_devpath_v4;
		break;
	case AF_INET6:
		devpath = sp->sp_smod_info->smod_fallback_devpath_v6;
		break;
	default:
		return (EINVAL);
	}

	/*
	 * Fallback can only happen if the socket module has a TPI device
	 * and fallback function.
	 */
	if (devpath == NULL || fbfunc == NULL)
		return (EINVAL);

	/*
	 * Initiate fallback; upon success we know that no new requests
	 * will come in from the user.
	 */
	if (!so_start_fallback(so))
		return (EAGAIN);
#ifdef DEBUG
	/*
	 * Make a copy of the sonode in case we need to make an integrity
	 * check later on.
	 */
	bcopy(so, &origso, sizeof (*so));
#endif

	sp->sp_stats.sps_nfallback.value.ui64++;

	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
	    so->so_protocol, devpath, KM_SLEEP, &error);
	if (error != 0)
		goto out;

	if (so->so_direct != NULL) {
		sodirect_t *sodp = so->so_direct;
		mutex_enter(&so->so_lock);

		so->so_direct->sod_enabled = B_FALSE;
		so->so_state &= ~SS_SODIRECT;
		ASSERT(sodp->sod_uioafh == NULL);
		mutex_exit(&so->so_lock);
	}

	/* Turn sonode into a TPI socket */
	error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
	if (error != 0)
		goto out;
	/*
	 * When it comes to urgent data we have two cases to deal with:
	 * (1) the oob byte has already arrived, or (2) the protocol has
	 * notified that oob data is pending, but it has not yet arrived.
	 *
	 * For (1) all we need to do is send a T_EXDATA_IND to indicate where
	 * in the byte stream the oob byte is. For (2) we have to send a
	 * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
	 * the oob byte will be the next byte from the protocol.
	 *
	 * So in the worst case we need two mblks, one for the signal, another
	 * for mark indication. In that case we use the exdata_mp for the sig.
	 */
	arg.soqa_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind),
	    BPRI_MED, STR_NOSIG, NULL);
	arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
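	/*
	 * Note: allocb_wait() with STR_NOSIG sleeps until the allocation
	 * succeeds rather than returning NULL, so both mblks are valid
	 * here; so_quiesced_cb() ASSERTs as much before using them.
	 */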

	/*
	 * Now tell the protocol to start using TPI. so_quiesced_cb() will
	 * be called once it is safe to synchronize state.
	 */
	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
	error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb,
	    &arg);
	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);

	if (error != 0) {
		/* protocol was unable to do a fallback, revert the sonode */
		sotpi_revert_sonode(so, cr);
		goto out;
	}

	/*
	 * Walk the accept queue and notify the proto that the queued
	 * connections should fall back to TPI. The protocol will send
	 * up the T_CONN_IND.
	 */
	nso = list_head(&so->so_acceptq_list);
	while (nso != NULL) {
		int rval;
		struct sonode *next;

		if (arg.soqa_exdata_mp == NULL) {
			arg.soqa_exdata_mp =
			    allocb_wait(sizeof (struct T_exdata_ind),
			    BPRI_MED, STR_NOSIG, NULL);
		}
		if (arg.soqa_urgmark_mp == NULL) {
			arg.soqa_urgmark_mp = allocb_wait(0, BPRI_MED,
			    STR_NOSIG, NULL);
		}

		DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
		rval = (*fbfunc)(nso->so_proto_handle, NULL, direct,
		    so_quiesced_cb, &arg);
		DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
		if (rval != 0) {
			/* Abort the connection */
			zcmn_err(getzoneid(), CE_WARN,
			    "Failed to convert socket in accept queue to TPI. "
			    "Pid = %d\n", curproc->p_pid);
			next = list_next(&so->so_acceptq_list, nso);
			list_remove(&so->so_acceptq_list, nso);
			so->so_acceptq_len--;

			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			nso = next;
		} else {
			nso = list_next(&so->so_acceptq_list, nso);
		}
	}

	/*
	 * Now flush the acceptq; this will destroy all sockets. They will
	 * be recreated in sotpi_accept().
	 */
	so_acceptq_flush(so, B_FALSE);

	mutex_enter(&so->so_lock);
	so->so_state |= SS_FALLBACK_COMP;
	mutex_exit(&so->so_lock);

	/*
	 * Swap the sonode ops. Socket operations that come in once this
	 * is done will proceed without blocking.
	 */
	so->so_ops = &sotpi_sonodeops;

	/*
	 * Wake up any threads stuck in poll. This is needed since the poll
	 * head changes when the fallback happens (moves from the sonode to
	 * the STREAMS head).
	 */
	pollwakeup(&so->so_poll_list, POLLERR);

	/*
	 * When this non-STREAM socket was created we placed an extra ref on
	 * the associated vnode to support asynchronous close. Drop that ref
	 * here.
	 */
	ASSERT(SOTOV(so)->v_count >= 2);
	VN_RELE(SOTOV(so));
out:
	so_end_fallback(so);

	if (error != 0) {
#ifdef DEBUG
		so_integrity_check(so, &origso);
#endif
		zcmn_err(getzoneid(), CE_WARN,
		    "Failed to convert socket to TPI (err=%d). Pid = %d\n",
		    error, curproc->p_pid);
		if (newsp != NULL)
			SOCKPARAMS_DEC_REF(newsp);
	}
	if (arg.soqa_exdata_mp != NULL)
		freemsg(arg.soqa_exdata_mp);
	if (arg.soqa_urgmark_mp != NULL)
		freemsg(arg.soqa_urgmark_mp);

	return (error);
}