/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/cmn_err.h>

#include <sys/stropts.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sodirect.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>
#include <sys/tihdr.h>

#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>
#include <sys/ddi.h>
#include <inet/ip.h>
#include <sys/time.h>
#include <sys/cmn_err.h>

#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

#define	MBLK_PULL_LEN 64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

#ifdef DEBUG
boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif

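/*
 * Append a new connection to the accept queue and wake up one thread
 * sleeping in accept(). The caller must hold so_acceptq_lock; see
 * so_acceptq_enqueue() below for the unlocked wrapper.
 */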
int
so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
{
	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
	ASSERT(nso->so_acceptq_next == NULL);

	*so->so_acceptq_tail = nso;
	so->so_acceptq_tail = &nso->so_acceptq_next;
	so->so_acceptq_len++;

	/* Wakeup a single consumer */
	cv_signal(&so->so_acceptq_cv);

	return (so->so_acceptq_len);
}

/*
 * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
 *
 * Enqueue an incoming connection on a listening socket.
 *
 * Arguments:
 *   so  - listening socket
 *   nso - new connection
 *
 * Returns:
 *   Number of queued connections, including the new connection
 */
int
so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
{
	int conns;

	mutex_enter(&so->so_acceptq_lock);
	conns = so_acceptq_enqueue_locked(so, nso);
	mutex_exit(&so->so_acceptq_lock);

	return (conns);
}

static int
so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	struct sonode *nso = NULL;

	*nsop = NULL;
	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
	while ((nso = so->so_acceptq_head) == NULL) {
		/*
		 * No need to check so_error here, because it is not
		 * possible for a listening socket to be reset or otherwise
		 * disconnected.
		 *
		 * So now we just need to check if it's OK to wait.
		 */
		if (dontblock)
			return (EWOULDBLOCK);
		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (cv_wait_sig_swap(&so->so_acceptq_cv,
		    &so->so_acceptq_lock) == 0)
			return (EINTR);
	}

	ASSERT(nso != NULL);
	so->so_acceptq_head = nso->so_acceptq_next;
	nso->so_acceptq_next = NULL;

	if (so->so_acceptq_head == NULL) {
		ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next);
		so->so_acceptq_tail = &so->so_acceptq_head;
	}
	ASSERT(so->so_acceptq_len > 0);
	--so->so_acceptq_len;

	*nsop = nso;

	return (0);
}

/*
 * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 *
 * Pulls a connection off of the accept queue.
 *
 * Arguments:
 *   so        - listening socket
 *   dontblock - indicate whether it's ok to sleep if there are no
 *               connections on the queue
 *   nsop      - value-return argument
 *
 * Return values:
 *   0 when a connection is successfully dequeued, in which case nsop
 *   is set to point to the new connection. Upon failure a non-zero
 *   value is returned, and the value of nsop is set to NULL.
 *
 * Note:
 *   so_acceptq_dequeue() may return prematurely if the socket is falling
 *   back to TPI.
 */
int
so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	int error;

	mutex_enter(&so->so_acceptq_lock);
	error = so_acceptq_dequeue_locked(so, dontblock, nsop);
	mutex_exit(&so->so_acceptq_lock);

	return (error);
}

/*
 * void so_acceptq_flush(struct sonode *so)
 *
 * Removes all pending connections from a listening socket, and
 * frees the associated resources.
 *
 * Arguments:
 *   so - listening socket
 *
 * Return values:
 *   None.
 *
 * Note:
 *   The caller has to ensure that no calls to so_acceptq_enqueue() or
 *   so_acceptq_dequeue() occur while the accept queue is being flushed.
 *   So either the socket needs to be in a state where no operations
 *   would come in, or so_lock needs to be obtained.
 */
void
so_acceptq_flush(struct sonode *so)
{
	struct sonode *nso;

	nso = so->so_acceptq_head;

	while (nso != NULL) {
		struct sonode *nnso = NULL;

		nnso = nso->so_acceptq_next;
		nso->so_acceptq_next = NULL;
		/*
		 * Since the socket is on the accept queue, there can
		 * only be one reference. We drop the reference and
		 * just blow off the socket.
		 */
		ASSERT(nso->so_count == 1);
		nso->so_count--;
		socket_destroy(nso);
		nso = nnso;
	}

	so->so_acceptq_head = NULL;
	so->so_acceptq_tail = &so->so_acceptq_head;
	so->so_acceptq_len = 0;
}

int
so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
    sock_connid_t id)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	/*
	 * The protocol has notified us that a connection attempt is being
	 * made, so before we wait for a notification to arrive we must
	 * clear out any errors associated with earlier connection attempts.
	 */
	if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
		so->so_error = 0;

	while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
		if (nonblock)
			return (EINPROGRESS);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
			return (EINTR);
	}

	if (so->so_error != 0)
		return (sogeterr(so, B_TRUE));
	/*
	 * Under normal circumstances, so_error should contain an error
	 * in case the connect failed. However, it is possible for another
	 * thread to come in and consume the error, so generate a sensible
	 * error in that case.
	 */
	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ECONNREFUSED);

	return (0);
}

/*
 * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 *     sock_connid_t id)
 *
 * Wait until the socket is connected or an error has occurred.
 *
 * Arguments:
 *   so       - socket
 *   nonblock - indicate whether it's ok to sleep if the connection has
 *              not yet been established
 *   id       - generation number that was returned by the protocol
 *              when the operation was started
 *
 * Returns:
 *   0 if the connection attempt was successful, or an error indicating why
 *   the connection attempt failed.
 */
int
so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
{
	int error;

	mutex_enter(&so->so_lock);
	error = so_wait_connected_locked(so, nonblock, id);
	mutex_exit(&so->so_lock);

	return (error);
}

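/*
 * Wait (with so_lock held) until the transport indicates that send
 * buffer space is available, i.e., until so_snd_qfull is cleared by
 * so_snd_qnotfull(). Honors SO_SNDTIMEO via so_sndtimeo; a timeout
 * results in EAGAIN, a signal or pending close in EINTR.
 */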
int
so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	while (so->so_snd_qfull) {
		if (so->so_state & SS_CANTSENDMORE)
			return (EPIPE);
		if (dontblock)
			return (EWOULDBLOCK);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (so->so_sndtimeo == 0) {
			/*
			 * Zero means disable timeout.
			 */
			error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
		} else {
			clock_t now;

			time_to_wait(&now, so->so_sndtimeo);
			error = cv_timedwait_sig(&so->so_snd_cv, &so->so_lock,
			    now);
		}
		if (error == 0)
			return (EINTR);
		else if (error == -1)
			return (EAGAIN);
	}
	return (0);
}

/*
 * int so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 *
 * Wait for the transport to notify us about send buffers becoming
 * available.
 */
int
so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	if (so->so_snd_qfull) {
		so->so_snd_wakeup = B_TRUE;
		error = so_snd_wait_qnotfull_locked(so, dontblock);
		so->so_snd_wakeup = B_FALSE;
	}
	mutex_exit(&so->so_lock);

	return (error);
}

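/*
 * Flow-control hooks called by the protocol: so_snd_qfull() marks the
 * send side as flow controlled, and so_snd_qnotfull() clears the
 * condition and wakes up all threads blocked in
 * so_snd_wait_qnotfull_locked().
 */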
void
so_snd_qfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_TRUE;
	mutex_exit(&so->so_lock);
}

void
so_snd_qnotfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_FALSE;
	/* wake up everyone waiting for buffers */
	cv_broadcast(&so->so_snd_cv);
	mutex_exit(&so->so_lock);
}

/*
 * Change the process/process group to which SIGIO is sent.
 */
int
socket_chgpgrp(struct sonode *so, pid_t pid)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	if (pid != 0) {
		/*
		 * Permissions check by sending signal 0.
		 * Note that when kill fails it does a
		 * set_errno causing the system call to fail.
		 */
		error = kill(pid, 0);
		if (error != 0) {
			return (error);
		}
	}
	so->so_pgrp = pid;
	return (0);
}


/*
 * Generate a SIGIO; for 'writable' events include a siginfo structure,
 * for read events just send the signal.
 */
/*ARGSUSED*/
static void
socket_sigproc(proc_t *proc, int event)
{
	k_siginfo_t info;

	ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));

	if (event & SOCKETSIG_WRITE) {
		info.si_signo = SIGPOLL;
		info.si_code = POLL_OUT;
		info.si_errno = 0;
		info.si_fd = 0;
		info.si_band = 0;
		sigaddq(proc, NULL, &info, KM_NOSLEEP);
	}
	if (event & SOCKETSIG_READ) {
		sigtoproc(proc, NULL, SIGPOLL);
	}
	if (event & SOCKETSIG_URG) {
		sigtoproc(proc, NULL, SIGURG);
	}
}

void
socket_sendsig(struct sonode *so, int event)
{
	proc_t *proc;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
	    event != SOCKETSIG_URG)) {
		return;
	}

	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));

	if (so->so_pgrp > 0) {
		/*
		 * XXX This unfortunately still generates
		 * a signal when a fd is closed but
		 * the proc is active.
		 */
		mutex_enter(&pidlock);
		proc = prfind(so->so_pgrp);
		if (proc == NULL) {
			mutex_exit(&pidlock);
			return;
		}
		mutex_enter(&proc->p_lock);
		mutex_exit(&pidlock);
		socket_sigproc(proc, event);
		mutex_exit(&proc->p_lock);
	} else {
		/*
		 * Send to process group. Hold pidlock across
		 * calls to socket_sigproc().
		 */
		pid_t pgrp = -so->so_pgrp;

		mutex_enter(&pidlock);
		proc = pgfind(pgrp);
		while (proc != NULL) {
			mutex_enter(&proc->p_lock);
			socket_sigproc(proc, event);
			mutex_exit(&proc->p_lock);
			proc = proc->p_pglink;
		}
		mutex_exit(&pidlock);
	}
}

#define	MIN(a, b) ((a) < (b) ? (a) : (b))
/* Copy userdata into a new mblk_t */
mblk_t *
socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
    size_t tail_len, int *errorp, cred_t *cr)
{
	mblk_t *head = NULL, **tail = &head;

	ASSERT(iosize == INFPSZ || iosize > 0);

	if (iosize == INFPSZ || iosize > uiop->uio_resid)
		iosize = uiop->uio_resid;

	if (maxblk == INFPSZ)
		maxblk = iosize;

	/* Nothing to do in these cases, so we're done */
	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
		goto done;

	/*
	 * We will enter the loop below if iosize is 0; it will allocate an
	 * empty message block and call uiomove(9F) which will just return.
	 * We could avoid that with an extra check but would only slow
	 * down the much more likely case where iosize is larger than 0.
	 */
	do {
		ssize_t blocksize;
		mblk_t *mp;

		blocksize = MIN(iosize, maxblk);
		ASSERT(blocksize >= 0);
		if (is_system_labeled())
			mp = allocb_cred(wroff + blocksize + tail_len,
			    cr, curproc->p_pid);
		else
			mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
		if (mp == NULL) {
			*errorp = ENOMEM;
			return (head);
		}
		mp->b_rptr += wroff;
		mp->b_wptr = mp->b_rptr + blocksize;

		*tail = mp;
		tail = &mp->b_cont;

		/* uiomove(9F) either returns 0 or EFAULT */
		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
		    UIO_WRITE, uiop)) != 0) {
			ASSERT(*errorp != ENOMEM);
			freemsg(head);
			return (NULL);
		}

		iosize -= blocksize;
	} while (iosize > 0);

done:
	*errorp = 0;
	return (head);
}

mblk_t *
socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
{
	int error;
	ptrdiff_t n;
	mblk_t *nmp;

	ASSERT(mp->b_wptr >= mp->b_rptr);

	/*
	 * max_read is the offset of the oobmark, and a read cannot go
	 * past the oobmark.
	 */
	if (max_read == INFPSZ || max_read > uiop->uio_resid)
		max_read = uiop->uio_resid;

	do {
		if ((n = MIN(max_read, MBLKL(mp))) != 0) {
			ASSERT(n > 0);

			error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
			if (error != 0) {
				freemsg(mp);
				*errorp = error;
				return (NULL);
			}
		}

		mp->b_rptr += n;
		max_read -= n;
		while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
			/*
			 * get rid of zero length mblks
			 */
			nmp = mp;
			mp = mp->b_cont;
			freeb(nmp);
		}
	} while (mp != NULL && max_read > 0);

	*errorp = 0;
	return (mp);
}

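/*
 * Put a (possibly partially consumed) message back at the head of the
 * receive queue. last_tail is stored in b_prev, which the queue uses
 * to remember the tail mblk of the message's chain.
 */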
static void
so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
{
	ASSERT(last_tail != NULL);
	mp->b_next = so->so_rcv_q_head;
	mp->b_prev = last_tail;
	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));

	if (so->so_rcv_q_head == NULL) {
		ASSERT(so->so_rcv_q_last_head == NULL);
		so->so_rcv_q_last_head = mp;
#ifdef DEBUG
	} else {
		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
#endif
	}
	so->so_rcv_q_head = mp;

#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
}

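/*
 * Move messages that the protocol has enqueued (so_rcv_head) onto the
 * processing queue (so_rcv_q_head), coalescing adjacent M_DATA
 * messages where possible.
 */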
static void
process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
{
	ASSERT(mp_head->b_prev != NULL);
	if (so->so_rcv_q_head == NULL) {
		so->so_rcv_q_head = mp_head;
		so->so_rcv_q_last_head = mp_last_head;
		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
	} else {
		boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
		    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));

		if (mp_head->b_next == NULL &&
		    DB_TYPE(mp_head) == M_DATA &&
		    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;
		} else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
			/*
			 * Append to last_head if there is more than one mblk,
			 * and both mp_head and last_head are I/OAT mblks.
			 */
			ASSERT(mp_head->b_next != NULL);
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;

			so->so_rcv_q_last_head->b_next = mp_head->b_next;
			mp_head->b_next = NULL;
			so->so_rcv_q_last_head = mp_last_head;
		} else {
#ifdef DEBUG
			{
				mblk_t *tmp_mblk;
				tmp_mblk = mp_head;
				while (tmp_mblk != NULL) {
					ASSERT(tmp_mblk->b_prev != NULL);
					tmp_mblk = tmp_mblk->b_next;
				}
			}
#endif
			so->so_rcv_q_last_head->b_next = mp_head;
			so->so_rcv_q_last_head = mp_last_head;
		}
	}
}

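/*
 * Dequeue a message from the receive queues and copy it out to the
 * user. Any leading M_PROTO/M_PCPROTO blocks are returned in *mctlp,
 * the M_DATA part is copied through uiop, and MOREDATA is set in
 * rvalp->r_val1 if part of the message remains queued. Sleeps for data
 * (subject to the flags and SO_RCVTIMEO) when nothing is queued.
 */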
int
so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
    rval_t *rvalp, int flags)
{
	mblk_t *mp, *nmp;
	mblk_t *savemp, *savemptail;
	mblk_t *new_msg_head;
	mblk_t *new_msg_last_head;
	mblk_t *last_tail;
	boolean_t partial_read;
	boolean_t reset_atmark = B_FALSE;
	int more = 0;
	int error;
	ssize_t oobmark;
	sodirect_t *sodp = so->so_direct;

	partial_read = B_FALSE;
	*mctlp = NULL;
again:
	mutex_enter(&so->so_lock);
again1:
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	/*
	 * First move messages from the dump area to processing area
	 */
	if (sodp != NULL) {
		/* No need to grab sod_lockp since it points to so_lock */
		if (sodp->sod_state & SOD_ENABLED) {
			ASSERT(sodp->sod_lockp == &so->so_lock);

			if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
				/* nothing to uioamove */
				sodp = NULL;
			} else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
				sodp->sod_uioa.uioa_state &= UIOA_CLR;
				sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
				/*
				 * try to uioamove() the data that
				 * has already been queued.
				 */
				sod_uioa_so_init(so, sodp, uiop);
			}
		} else {
			sodp = NULL;
		}
	}
	new_msg_head = so->so_rcv_head;
	new_msg_last_head = so->so_rcv_last_head;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	oobmark = so->so_oobmark;
	/*
	 * We can release the lock as there can only be one reader
	 */
	mutex_exit(&so->so_lock);

	if (so->so_state & SS_RCVATMARK) {
		reset_atmark = B_TRUE;
	}
	if (new_msg_head != NULL) {
		process_new_message(so, new_msg_head, new_msg_last_head);
	}
	savemp = savemptail = NULL;
	rvalp->r_val1 = 0;
	error = 0;
	mp = so->so_rcv_q_head;

	if (mp != NULL &&
	    (so->so_rcv_timer_tid == 0 ||
	    so->so_rcv_queued >= so->so_rcv_thresh)) {
		partial_read = B_FALSE;

		if (flags & MSG_PEEK) {
			if ((nmp = dupmsg(mp)) == NULL &&
			    (nmp = copymsg(mp)) == NULL) {
				size_t size = msgsize(mp);

				error = strwaitbuf(size, BPRI_HI);
				if (error) {
					return (error);
				}
				goto again;
			}
			mp = nmp;
		} else {
			ASSERT(mp->b_prev != NULL);
			last_tail = mp->b_prev;
			mp->b_prev = NULL;
			so->so_rcv_q_head = mp->b_next;
			if (so->so_rcv_q_head == NULL) {
				so->so_rcv_q_last_head = NULL;
			}
			mp->b_next = NULL;
		}

		ASSERT(mctlp != NULL);
		/*
		 * First process PROTO or PCPROTO blocks, if any.
		 */
		if (DB_TYPE(mp) != M_DATA) {
			*mctlp = mp;
			savemp = mp;
			savemptail = mp;
			ASSERT(DB_TYPE(mp) == M_PROTO ||
			    DB_TYPE(mp) == M_PCPROTO);
			while (mp->b_cont != NULL &&
			    DB_TYPE(mp->b_cont) != M_DATA) {
				ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
				    DB_TYPE(mp->b_cont) == M_PCPROTO);
				mp = mp->b_cont;
				savemptail = mp;
			}
			mp = savemptail->b_cont;
			savemptail->b_cont = NULL;
		}

		ASSERT(DB_TYPE(mp) == M_DATA);
		/*
		 * Now process DATA blocks, if any. Note that for sodirect
		 * enabled socket, uio_resid can be 0.
		 */
		if (uiop->uio_resid >= 0) {
			ssize_t copied = 0;

			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
				mutex_enter(sodp->sod_lockp);
				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
				copied = sod_uioa_mblk(so, mp);
				if (copied > 0)
					partial_read = B_TRUE;
				mutex_exit(sodp->sod_lockp);
				/* mark this mblk as processed */
				mp = NULL;
			} else {
				ssize_t oldresid = uiop->uio_resid;

				if (MBLKL(mp) < so_mblk_pull_len) {
					if (pullupmsg(mp, -1) == 1) {
						last_tail = mp;
					}
				}
				/*
				 * Cannot read beyond the oobmark
				 */
				mp = socopyoutuio(mp, uiop,
				    oobmark == 0 ? INFPSZ : oobmark, &error);
				if (error != 0) {
					freemsg(*mctlp);
					*mctlp = NULL;
					more = 0;
					goto done;
				}
				ASSERT(oldresid >= uiop->uio_resid);
				copied = oldresid - uiop->uio_resid;
				if (oldresid > uiop->uio_resid)
					partial_read = B_TRUE;
			}
			ASSERT(copied >= 0);
			if (copied > 0 && !(flags & MSG_PEEK)) {
				mutex_enter(&so->so_lock);
				so->so_rcv_queued -= copied;
				ASSERT(so->so_oobmark >= 0);
				if (so->so_oobmark > 0) {
					so->so_oobmark -= copied;
					ASSERT(so->so_oobmark >= 0);
					if (so->so_oobmark == 0) {
						ASSERT(so->so_state &
						    SS_OOBPEND);
						so->so_oobmark = 0;
						so->so_state |= SS_RCVATMARK;
					}
				}
				if (so->so_flowctrld && so->so_rcv_queued <
				    so->so_rcvlowat) {
					so->so_flowctrld = B_FALSE;
					mutex_exit(&so->so_lock);
					/*
					 * Open up flow control. SCTP does
					 * not have any downcalls, and it will
					 * clr flow ctrl in sosctp_recvmsg().
					 */
					if (so->so_downcalls != NULL &&
					    so->so_downcalls->sd_clr_flowctrl !=
					    NULL) {
						(*so->so_downcalls->
						    sd_clr_flowctrl)
						    (so->so_proto_handle);
					}
				} else {
					mutex_exit(&so->so_lock);
				}
			}
		}
		if (mp != NULL) { /* more data blocks in msg */
			more |= MOREDATA;
			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
				if (flags & MSG_TRUNC &&
				    ((flags & MSG_PEEK) == 0)) {
					mutex_enter(&so->so_lock);
					so->so_rcv_queued -= msgdsize(mp);
					mutex_exit(&so->so_lock);
				}
				freemsg(mp);
			} else if (partial_read && !somsghasdata(mp)) {
				/*
				 * Avoid queuing a zero-length tail part of
				 * a message. partial_read == 1 indicates that
				 * we read some of the message.
				 */
				freemsg(mp);
				more &= ~MOREDATA;
			} else {
				if (savemp != NULL &&
				    (flags & MSG_DUPCTRL)) {
					mblk_t *nmp;
					/*
					 * There should only be non-data mblks
					 */
					ASSERT(DB_TYPE(savemp) != M_DATA &&
					    DB_TYPE(savemptail) != M_DATA);
try_again:
					if ((nmp = dupmsg(savemp)) == NULL &&
					    (nmp = copymsg(savemp)) == NULL) {

						size_t size = msgsize(savemp);

						error = strwaitbuf(size,
						    BPRI_HI);
						if (error != 0) {
							/*
							 * In case we
							 * cannot copy
							 * control data
							 * free the remaining
							 * data.
							 */
							freemsg(mp);
							goto done;
						}
						goto try_again;
					}

					ASSERT(nmp != NULL);
					ASSERT(DB_TYPE(nmp) != M_DATA);
					savemptail->b_cont = mp;
					*mctlp = nmp;
					mp = savemp;
				}
				/*
				 * putback mp
				 */
				so_prepend_msg(so, mp, last_tail);
			}
		}

		/* fast check so_rcv_head if there is more data */
		if (partial_read && !(so->so_state & SS_RCVATMARK) &&
		    *mctlp == NULL && uiop->uio_resid > 0 &&
		    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
			goto again;
		}
	} else if (!partial_read) {
		mutex_enter(&so->so_lock);
		if (so->so_error != 0) {
			error = sogeterr(so, !(flags & MSG_PEEK));
			mutex_exit(&so->so_lock);
			return (error);
		}
		/*
		 * No pending data. Return right away for a nonblocking
		 * socket, otherwise sleep waiting for data.
		 */
		if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
			    (flags & MSG_DONTWAIT)) {
				error = EWOULDBLOCK;
			} else {
				if (so->so_state & (SS_CLOSING |
				    SS_FALLBACK_PENDING)) {
					mutex_exit(&so->so_lock);
					error = EINTR;
					goto done;
				}

				if (so->so_rcv_head != NULL) {
					goto again1;
				}
				so->so_rcv_wakeup = B_TRUE;
				so->so_rcv_wanted = uiop->uio_resid;
				if (so->so_rcvtimeo == 0) {
					/*
					 * Zero means disable timeout.
					 */
					error = cv_wait_sig(&so->so_rcv_cv,
					    &so->so_lock);
				} else {
					clock_t now;
					time_to_wait(&now, so->so_rcvtimeo);
					error = cv_timedwait_sig(&so->so_rcv_cv,
					    &so->so_lock, now);
				}
				so->so_rcv_wakeup = B_FALSE;
				so->so_rcv_wanted = 0;

				if (error == 0) {
					error = EINTR;
				} else if (error == -1) {
					error = EAGAIN;
				} else {
					goto again1;
				}
			}
		}
		mutex_exit(&so->so_lock);
	}
	if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
		/*
		 * We are past the mark; update the state.
		 * 4.3BSD and 4.4BSD clear the mark when peeking across it.
		 * The draft Posix socket spec states that the mark should
		 * not be cleared when peeking. We follow the latter.
		 */
		mutex_enter(&so->so_lock);
		ASSERT(so_verify_oobstate(so));
		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		ASSERT(so_verify_oobstate(so));
		mutex_exit(&so->so_lock);
	}
	ASSERT(so->so_rcv_wakeup == B_FALSE);
done:
	if (sodp != NULL) {
		mutex_enter(sodp->sod_lockp);
		if ((sodp->sod_state & SOD_ENABLED) &&
		    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
			SOD_UIOAFINI(sodp);
			if (sodp->sod_uioa.uioa_mbytes > 0) {
				ASSERT(so->so_rcv_q_head != NULL ||
				    so->so_rcv_head != NULL);
				so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
		mutex_exit(sodp->sod_lockp);
	}
#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
	rvalp->r_val1 = more;
	return (error);
}

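/*
 * Enqueue data from the protocol on the receive queue (so_rcv_head).
 * The data is appended to the last message if both it and the new mblk
 * are M_DATA with matching DBLK_UIOA flags; otherwise a new message is
 * started. The caller must hold so_lock.
 */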
void
so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	so->so_rcv_queued += msg_size;

	if (so->so_rcv_head == NULL) {
		ASSERT(so->so_rcv_last_head == NULL);
		so->so_rcv_head = mp;
		so->so_rcv_last_head = mp;
	} else if ((DB_TYPE(mp) == M_DATA &&
	    DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
	    ((DB_FLAGS(mp) & DBLK_UIOA) ==
	    (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
		/* Added to the end */
		ASSERT(so->so_rcv_last_head != NULL);
		ASSERT(so->so_rcv_last_head->b_prev != NULL);
		so->so_rcv_last_head->b_prev->b_cont = mp;
	} else {
		/* Start a new end */
		so->so_rcv_last_head->b_next = mp;
		so->so_rcv_last_head = mp;
	}
	while (mp->b_cont != NULL)
		mp = mp->b_cont;

	so->so_rcv_last_head->b_prev = mp;
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
}

/*
 * Return B_TRUE if there is data in the message, B_FALSE otherwise.
 */
boolean_t
somsghasdata(mblk_t *mp)
{
	for (; mp; mp = mp->b_cont)
		if (mp->b_datap->db_type == M_DATA) {
			ASSERT(mp->b_wptr >= mp->b_rptr);
			if (mp->b_wptr > mp->b_rptr)
				return (B_TRUE);
		}
	return (B_FALSE);
}

/*
 * Flush the read side of sockfs.
 *
 * The caller must be sure that a reader is not already active when the
 * buffer is being flushed.
 */
void
so_rcv_flush(struct sonode *so)
{
	mblk_t *mp;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_oobmsg != NULL) {
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		so->so_oobmark = 0;
		so->so_state &=
		    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
	}

	/*
	 * Free messages sitting in the recv queues
	 */
	while (so->so_rcv_q_head != NULL) {
		mp = so->so_rcv_q_head;
		so->so_rcv_q_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	while (so->so_rcv_head != NULL) {
		mp = so->so_rcv_head;
		so->so_rcv_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
}

/*
 * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
 */
int
sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
    boolean_t oob_inline)
{
	mblk_t *mp, *nmp;
	int error;

	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
	    flags));

	if (msg != NULL) {
		/*
		 * There is never any oob data with addresses or control since
		 * the T_EXDATA_IND does not carry any options.
		 */
		msg->msg_controllen = 0;
		msg->msg_namelen = 0;
		msg->msg_flags = 0;
	}

	mutex_enter(&so->so_lock);
	ASSERT(so_verify_oobstate(so));
	if (oob_inline ||
	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
		mutex_exit(&so->so_lock);
		return (EINVAL);
	}
	if (!(so->so_state & SS_HAVEOOBDATA)) {
		dprintso(so, 1, ("sorecvoob: no data yet\n"));
		mutex_exit(&so->so_lock);
		return (EWOULDBLOCK);
	}
	ASSERT(so->so_oobmsg != NULL);
	mp = so->so_oobmsg;
	if (flags & MSG_PEEK) {
		/*
		 * Since recv* cannot return ENOBUFS we cannot use dupmsg.
		 * Instead we revert to the consolidation private
		 * allocb_wait plus bcopy.
		 */
		mblk_t *mp1;

		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
		ASSERT(mp1);

		while (mp != NULL) {
			ssize_t size;

			size = MBLKL(mp);
			bcopy(mp->b_rptr, mp1->b_wptr, size);
			mp1->b_wptr += size;
			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
			mp = mp->b_cont;
		}
		mp = mp1;
	} else {
		/*
		 * Update the state indicating that the data has been consumed.
		 * Keep SS_OOBPEND set until data is consumed past the mark.
		 */
		so->so_oobmsg = NULL;
		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
	}
	ASSERT(so_verify_oobstate(so));
	mutex_exit(&so->so_lock);

	error = 0;
	nmp = mp;
	while (nmp != NULL && uiop->uio_resid > 0) {
		ssize_t n = MBLKL(nmp);

		n = MIN(n, uiop->uio_resid);
		if (n > 0)
			error = uiomove(nmp->b_rptr, n,
			    UIO_READ, uiop);
		if (error)
			break;
		nmp = nmp->b_cont;
	}
	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
	freemsg(mp);
	return (error);
}

/*
 * Allocate and initialize a sonode
 */
/* ARGSUSED */
struct sonode *
socket_sonode_create(struct sockparams *sp, int family, int type,
    int protocol, int version, int sflags, int *errorp, struct cred *cr)
{
	sonode_t *so;
	int kmflags;

	/*
	 * Choose the right set of sonodeops based on the upcall and
	 * downcall version that the protocol has provided
	 */
	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
		/*
		 * mismatch
		 */
#ifdef DEBUG
		cmn_err(CE_CONT, "protocol and socket module version mismatch");
#endif
		*errorp = EINVAL;
		return (NULL);
	}

	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

	so = kmem_cache_alloc(socket_cache, kmflags);
	if (so == NULL) {
		*errorp = ENOMEM;
		return (NULL);
	}

	sonode_init(so, sp, family, type, protocol, &so_sonodeops);

	if (version == SOV_DEFAULT)
		version = so_default_version;

	so->so_version = (short)version;

	/*
	 * set the default values to be INFPSZ
	 * if a protocol desires it can change the value later
	 */
	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
	so->so_proto_props.sopp_maxpsz = INFPSZ;
	so->so_proto_props.sopp_maxblk = INFPSZ;

	return (so);
}

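/*
 * Finish initializing a socket. For a passive open (pso != NULL) the
 * new socket inherits basic state and socket-level options from the
 * listener; for an active open the protocol's lower handle is created
 * and activated here.
 */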
int
socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
{
	int error = 0;

	if (pso != NULL) {
		/*
		 * We have a passive open, so inherit basic state from
		 * the parent (listener).
		 *
		 * No need to grab the new sonode's lock, since there is no
		 * one that can have a reference to it.
		 */
		mutex_enter(&pso->so_lock);

		so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
		so->so_pgrp = pso->so_pgrp;
		so->so_rcvtimeo = pso->so_rcvtimeo;
		so->so_sndtimeo = pso->so_sndtimeo;
		so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
		/*
		 * Make note of the socket level options. TCP and IP level
		 * options are already inherited. We could do all this after
		 * accept is successful but doing it here simplifies code and
		 * no harm done for error case.
		 */
		so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
		    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
		    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
		so->so_proto_props = pso->so_proto_props;
		so->so_mode = pso->so_mode;
		so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;

		mutex_exit(&pso->so_lock);

		if (uioasync.enabled) {
			sod_sock_init(so, NULL, NULL, NULL, &so->so_lock);
		}
		return (0);
	} else {
		struct sockparams *sp = so->so_sockparams;
		sock_upcalls_t *upcalls_to_use;

		/*
		 * Based on the version number select the right upcalls to
		 * pass down. Currently we only have one version, so choose
		 * the default.
		 */
		upcalls_to_use = &so_upcalls;

		/* active open, so create a lower handle */
		so->so_proto_handle =
		    sp->sp_smod_info->smod_proto_create_func(so->so_family,
		    so->so_type, so->so_protocol, &so->so_downcalls,
		    &so->so_mode, &error, flags, cr);

		if (so->so_proto_handle == NULL) {
			ASSERT(error != 0);
			/*
			 * To be safe; if a lower handle cannot be created, and
			 * the proto does not give a reason why, assume there
			 * was a lack of memory.
			 */
			return ((error == 0) ? ENOMEM : error);
		}
		ASSERT(so->so_downcalls != NULL);
		ASSERT(so->so_downcalls->sd_send != NULL ||
		    so->so_downcalls->sd_send_uio != NULL);
		if (so->so_downcalls->sd_recv_uio != NULL) {
			ASSERT(so->so_downcalls->sd_poll != NULL);
			so->so_pollev |= SO_POLLEV_ALWAYS;
		}

		(*so->so_downcalls->sd_activate)(so->so_proto_handle,
		    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);

		/* Wildcard */

		/*
		 * FIXME No need for this, the protocol can deal with it in
		 * sd_create(). Should update ICMP.
		 */
		if (so->so_protocol != so->so_sockparams->sp_protocol) {
			int protocol = so->so_protocol;
			int error;
			/*
			 * Issue SO_PROTOTYPE setsockopt.
			 */
			error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
			if (error) {
				(void) (*so->so_downcalls->sd_close)
				    (so->so_proto_handle, 0, cr);

				mutex_enter(&so->so_lock);
				so_rcv_flush(so);
				mutex_exit(&so->so_lock);
				/*
				 * Setsockopt often fails with ENOPROTOOPT but
				 * socket() should fail with
				 * EPROTONOSUPPORT/EPROTOTYPE.
				 */
				return (EPROTONOSUPPORT);
			}
		}
		return (0);
	}
}

/*
 * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
 *     struct cred *cr, int32_t *rvalp)
 *
 * Handle ioctls that manipulate basic socket state; non-blocking,
 * async, etc.
 *
 * Returns:
 *   < 0  - ioctl was not handled
 *   >= 0 - ioctl was handled, if > 0, then it is an errno
 *
 * Notes:
 *   Assumes the standard receive buffer is used to obtain info for
 *   NREAD.
 */
/* ARGSUSED */
int
socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	switch (cmd) {
	case SIOCSQPTR:
		/*
		 * SIOCSQPTR is valid only when helper stream is created
		 * by the protocol.
		 */

		return (EOPNOTSUPP);
	case FIONBIO: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		if (value) {
			so->so_state |= SS_NDELAY;
		} else {
			so->so_state &= ~SS_NDELAY;
		}
		mutex_exit(&so->so_lock);
		return (0);
	}
	case FIOASYNC: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);

		if (value) {
			/* Turn on SIGIO */
			so->so_state |= SS_ASYNC;
		} else {
			/* Turn off SIGIO */
			so->so_state &= ~SS_ASYNC;
		}
		mutex_exit(&so->so_lock);

		return (0);
	}

	case SIOCSPGRP:
	case FIOSETOWN: {
		int error;
		pid_t pid;

		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
		mutex_exit(&so->so_lock);
		return (error);
	}
	case SIOCGPGRP:
	case FIOGETOWN:
		if (so_copyout(&so->so_pgrp, (void *)arg,
		    sizeof (pid_t), (mode & (int)FKIOCTL)))
			return (EFAULT);

		return (0);
	case SIOCATMARK: {
		int retval;

		/*
		 * Only protocols that support urgent data can handle ATMARK.
		 */
		if ((so->so_mode & SM_EXDATA) == 0)
			return (EINVAL);

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = (so->so_state & SS_RCVATMARK) != 0;

		if (so_copyout(&retval, (void *)arg, sizeof (int),
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case FIONREAD: {
		int retval;

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = MIN(so->so_rcv_queued, INT_MAX);

		if (so_copyout(&retval, (void *)arg,
		    sizeof (retval), (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case _I_GETPEERCRED: {
		int error = 0;

		if ((mode & FKIOCTL) == 0)
			return (EINVAL);

		mutex_enter(&so->so_lock);
		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
			error = ENOTSUP;
		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
			error = ENOTCONN;
		} else if (so->so_peercred != NULL) {
			k_peercred_t *kp = (k_peercred_t *)arg;
			kp->pc_cr = so->so_peercred;
			kp->pc_cpid = so->so_cpid;
			crhold(so->so_peercred);
		} else {
			error = EINVAL;
		}
		mutex_exit(&so->so_lock);
		return (error);
	}
	default:
		return (-1);
	}
}

/*
 * Handle the I_NREAD STREAM ioctl.
 */
static int
so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
{
	size_t size = 0;
	int retval;
	int count = 0;
	mblk_t *mp;

	if (so->so_downcalls == NULL ||
	    so->so_downcalls->sd_recv_uio != NULL)
		return (EINVAL);

	mutex_enter(&so->so_lock);
	/* Wait for reader to get out of the way. */
	while (so->so_flag & SOREADLOCKED) {
		/*
		 * If reader is waiting for data, then there should be nothing
		 * on the rcv queue.
		 */
		if (so->so_rcv_wakeup)
			goto out;

		so->so_flag |= SOWANT;
		/* Do a timed sleep, in case the reader goes to sleep. */
		(void) cv_timedwait(&so->so_state_cv, &so->so_lock,
		    lbolt + drv_usectohz(10));
	}

	/*
	 * Since we are holding so_lock no new reader will come in, and the
	 * protocol will not be able to enqueue data. So it's safe to walk
	 * both rcv queues.
	 */
	mp = so->so_rcv_q_head;
	if (mp != NULL) {
		size = msgdsize(so->so_rcv_q_head);
		for (; mp != NULL; mp = mp->b_next)
			count++;
	} else {
		/*
		 * In case the processing list was empty, get the size of the
		 * next msg in line.
		 */
		size = msgdsize(so->so_rcv_head);
	}

	for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
		count++;
out:
	mutex_exit(&so->so_lock);

	/*
	 * Drop down from size_t to the "int" required by the
	 * interface. Cap at INT_MAX.
	 */
	retval = MIN(size, INT_MAX);
	if (so_copyout(&retval, (void *)arg, sizeof (retval),
	    (mode & (int)FKIOCTL))) {
		return (EFAULT);
	} else {
		*rvalp = count;
		return (0);
	}
}

/*
 * Process STREAM ioctls.
 *
 * Returns:
 *   < 0  - ioctl was not handled
 *   >= 0 - ioctl was handled, if > 0, then it is an errno
 */
int
socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	int retval;

	/* Only STREAM ioctls are handled here */
	if ((cmd & 0xffffff00U) != STR)
		return (-1);

	switch (cmd) {
	case I_CANPUT:
		/*
		 * We return an error for I_CANPUT so that isastream(3C) will
		 * not report the socket as being a STREAM.
		 */
		return (EOPNOTSUPP);
	case I_NREAD:
		/* Avoid doing a fallback for I_NREAD. */
		return (so_strioc_nread(so, arg, mode, rvalp));
	case I_LOOK:
		/* Avoid doing a fallback for I_LOOK. */
		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	default:
		break;
	}

	/*
	 * Try to fall back to TPI, and if successful, reissue the ioctl.
	 */
	if ((retval = so_tpi_fallback(so, cr)) == 0) {
		/* Reissue the ioctl */
		ASSERT(so->so_rcv_q_head == NULL);
		return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
	} else {
		return (retval);
	}
}

int
socket_getopt_common(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags)
{
	if (level != SOL_SOCKET)
		return (-1);

	switch (option_name) {
	case SO_ERROR:
	case SO_DOMAIN:
	case SO_TYPE:
	case SO_ACCEPTCONN: {
		int32_t value;
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t)) {
			return (EINVAL);
		}

		switch (option_name) {
		case SO_ERROR:
			mutex_enter(&so->so_lock);
			value = sogeterr(so, B_TRUE);
			mutex_exit(&so->so_lock);
			break;
		case SO_DOMAIN:
			value = so->so_family;
			break;
		case SO_TYPE:
			value = so->so_type;
			break;
		case SO_ACCEPTCONN:
			if (so->so_state & SS_ACCEPTCONN)
				value = SO_ACCEPTCONN;
			else
				value = 0;
			break;
		}

		bcopy(&value, optval, sizeof (value));
		*optlenp = sizeof (value);

		return (0);
	}
	case SO_SNDTIMEO:
	case SO_RCVTIMEO: {
		clock_t value;
		socklen_t optlen = *optlenp;

		if (get_udatamodel() == DATAMODEL_NONE ||
		    get_udatamodel() == DATAMODEL_NATIVE) {
			if (optlen < sizeof (struct timeval))
				return (EINVAL);
		} else {
			if (optlen < sizeof (struct timeval32))
				return (EINVAL);
		}
		if (option_name == SO_RCVTIMEO)
			value = drv_hztousec(so->so_rcvtimeo);
		else
			value = drv_hztousec(so->so_sndtimeo);

		if (get_udatamodel() == DATAMODEL_NONE ||
		    get_udatamodel() == DATAMODEL_NATIVE) {
			((struct timeval *)(optval))->tv_sec =
			    value / (1000 * 1000);
			((struct timeval *)(optval))->tv_usec =
			    value % (1000 * 1000);
			*optlenp = sizeof (struct timeval);
		} else {
			((struct timeval32 *)(optval))->tv_sec =
			    value / (1000 * 1000);
			((struct timeval32 *)(optval))->tv_usec =
			    value % (1000 * 1000);
			*optlenp = sizeof (struct timeval32);
		}
		return (0);
	}
	case SO_DEBUG:
	case SO_REUSEADDR:
	case SO_KEEPALIVE:
	case SO_DONTROUTE:
	case SO_BROADCAST:
	case SO_USELOOPBACK:
	case SO_OOBINLINE:
	case SO_SNDBUF:
#ifdef notyet
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
#endif /* notyet */
	case SO_DGRAM_ERRIND: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);
		break;
	}
	case SO_RCVBUF: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);

		if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
			/*
			 * XXX If SO_RCVBUF has been set and this is an
			 * XPG 4.2 application then do not ask the transport
			 * since the transport might adjust the value and not
			 * return exactly what was set by the application.
			 * For non-XPG 4.2 application we return the value
			 * that the transport is actually using.
			 */
			*(int32_t *)optval = so->so_xpg_rcvbuf;
			*optlenp = sizeof (so->so_xpg_rcvbuf);
			return (0);
		}
		/*
		 * If the option has not been set then get a default
		 * value from the transport.
		 */
		break;
	}
	case SO_LINGER: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct linger))
			return (EINVAL);
		break;
	}
	case SO_SND_BUFINFO: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
			return (EINVAL);
		((struct so_snd_bufinfo *)(optval))->sbi_wroff =
		    (so->so_proto_props).sopp_wroff;
		((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
		    (so->so_proto_props).sopp_maxblk;
		((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
		    (so->so_proto_props).sopp_maxpsz;
		((struct so_snd_bufinfo *)(optval))->sbi_tail =
		    (so->so_proto_props).sopp_tail;
		*optlenp = sizeof (struct so_snd_bufinfo);
		return (0);
	}
	default:
		break;
	}

	/* Unknown Option */
	return (-1);
}

void
socket_sonode_destroy(struct sonode *so)
{
	sonode_fini(so);
	kmem_cache_free(socket_cache, so);
}

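/*
 * Wait for a zero-copy notification (STZCNOTIFY) from the transport.
 * Returns EINTR if the socket is closing or a signal is received;
 * the notify flag is consumed before returning.
 */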
int
so_zcopy_wait(struct sonode *so)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	while (!(so->so_copyflag & STZCNOTIFY)) {
		if (so->so_state & SS_CLOSING) {
			mutex_exit(&so->so_lock);
			return (EINTR);
		}
		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
			error = EINTR;
			break;
		}
	}
	so->so_copyflag &= ~STZCNOTIFY;
	mutex_exit(&so->so_lock);
	return (error);
}

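/*
 * Receive buffer timer callback. If data is still queued when the
 * timer fires the reader is notified; note that so_notify_data()
 * releases so_lock, hence the asymmetric mutex_exit() below.
 */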
void
so_timer_callback(void *arg)
{
	struct sonode *so = (struct sonode *)arg;

	mutex_enter(&so->so_lock);

	so->so_rcv_timer_tid = 0;
	if (so->so_rcv_queued > 0) {
		so_notify_data(so, so->so_rcv_queued);
	} else {
		mutex_exit(&so->so_lock);
	}
}

#ifdef DEBUG
/*
 * Verify that the length stored in so_rcv_queued matches the length of
 * the data blocks actually queued.
 */
static boolean_t
so_check_length(sonode_t *so)
{
	mblk_t *mp = so->so_rcv_q_head;
	int len = 0;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (mp != NULL) {
		len = msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	mp = so->so_rcv_head;
	if (mp != NULL) {
		len += msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
}
#endif

int
so_get_mod_version(struct sockparams *sp)
{
	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
	return (sp->sp_smod_info->smod_version);
}

/*
 * so_start_fallback()
 *
 * Block new socket operations from coming in, and wait for active
 * operations to complete. Threads that are sleeping will be woken up
 * so they can get out of the way.
 *
 * The caller must be a reader on so_fallback_rwlock.
 */
static boolean_t
so_start_fallback(struct sonode *so)
{
	ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));

	mutex_enter(&so->so_lock);
	if (so->so_state & SS_FALLBACK_PENDING) {
		mutex_exit(&so->so_lock);
		return (B_FALSE);
	}
	so->so_state |= SS_FALLBACK_PENDING;
	/*
	 * Poke all threads that might be sleeping. Any operation that comes
	 * in after the cv_broadcast will observe the fallback pending flag
	 * which causes the call to return where it would normally sleep.
	 */
	cv_broadcast(&so->so_state_cv);		/* threads in connect() */
	cv_broadcast(&so->so_rcv_cv);		/* threads in recvmsg() */
	cv_broadcast(&so->so_snd_cv);		/* threads in sendmsg() */
	mutex_enter(&so->so_acceptq_lock);
	cv_broadcast(&so->so_acceptq_cv);	/* threads in accept() */
	mutex_exit(&so->so_acceptq_lock);
	mutex_exit(&so->so_lock);

	/*
	 * The main reason for the rw_tryupgrade call is to provide
	 * observability during the fallback process. We want to
	 * be able to see if there are pending operations.
	 */
	if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
		/*
		 * It is safe to drop and reacquire the fallback lock, because
		 * we are guaranteed that another fallback cannot take place.
		 */
		rw_exit(&so->so_fallback_rwlock);
		DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
		rw_enter(&so->so_fallback_rwlock, RW_WRITER);
		DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
	}

	return (B_TRUE);
}

/*
 * so_end_fallback()
 *
 * Allow socket operations back in.
 *
 * The caller must be a writer on so_fallback_rwlock.
 */
static void
so_end_fallback(struct sonode *so)
{
	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));

	mutex_enter(&so->so_lock);
	so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
	mutex_exit(&so->so_lock);

	rw_downgrade(&so->so_fallback_rwlock);
}

/*
 * so_quiesced_cb()
 *
 * Callback passed to the protocol during fallback. It is called once
 * the endpoint is quiescent.
 *
 * No requests from the user, no notifications from the protocol, so it
 * is safe to synchronize the state. Data can also be moved without
 * risk of reordering.
 *
 * We do not need to hold so_lock, since there can be only one thread
 * operating on the sonode.
 */
static void
so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
    struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
    struct sockaddr *faddr, socklen_t faddrlen, short opts)
{
	struct sonode *so = (struct sonode *)sock_handle;
	boolean_t atmark;

	sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);

	/*
	 * Some protocols do not quiesce the data path during fallback. Once
	 * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
	 * fail and the protocol is responsible for saving the data for later
	 * delivery (i.e., once the fallback has completed).
	 */
	mutex_enter(&so->so_lock);
	so->so_state |= SS_FALLBACK_DRAIN;
	SOCKET_TIMER_CANCEL(so);
	mutex_exit(&so->so_lock);

Yu Xiangning0f1702c2008-12-11 20:04:13 -08001972 if (so->so_rcv_head != NULL) {
1973 if (so->so_rcv_q_last_head == NULL)
1974 so->so_rcv_q_head = so->so_rcv_head;
1975 else
1976 so->so_rcv_q_last_head->b_next = so->so_rcv_head;
1977 so->so_rcv_q_last_head = so->so_rcv_last_head;
1978 }
1979
Anders Persson41174432009-02-12 17:35:05 -08001980 atmark = (so->so_state & SS_RCVATMARK) != 0;
1981 /*
1982 * Clear any OOB state having to do with pending data. The TPI
1983 * code path will set the appropriate oob state when we move the
1984 * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
1985 * data has already been consumed.
1986 */
1987 so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);
1988
1989 ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);
1990
1991 /*
1992 * Move data to the STREAM head.
1993 */
	while (so->so_rcv_q_head != NULL) {
		mblk_t *mp = so->so_rcv_q_head;
		size_t mlen = msgdsize(mp);

		so->so_rcv_q_head = mp->b_next;
		mp->b_next = NULL;
		mp->b_prev = NULL;

		/*
		 * Send T_EXDATA_IND if we are at the oob mark.
		 */
		if (atmark) {
			struct T_exdata_ind *tei;
			mblk_t *mp1 = SOTOTPI(so)->sti_exdata_mp;

			SOTOTPI(so)->sti_exdata_mp = NULL;
			ASSERT(mp1 != NULL);
			mp1->b_datap->db_type = M_PROTO;
			tei = (struct T_exdata_ind *)mp1->b_rptr;
			tei->PRIM_type = T_EXDATA_IND;
			tei->MORE_flag = 0;
			mp1->b_wptr = (uchar_t *)&tei[1];

			if (IS_SO_OOB_INLINE(so)) {
				mp1->b_cont = mp;
			} else {
				ASSERT(so->so_oobmsg != NULL);
				mp1->b_cont = so->so_oobmsg;
				so->so_oobmsg = NULL;

				/* process current mp next time around */
				mp->b_next = so->so_rcv_q_head;
				so->so_rcv_q_head = mp;
				mlen = 0;
			}
			mp = mp1;

			/* we have consumed the oob mark */
			atmark = B_FALSE;
		} else if (so->so_oobmark > 0) {
			/*
			 * Check if the OOB mark is within the current
			 * mblk chain. In that case we have to split it up.
			 */
			if (so->so_oobmark < mlen) {
				mblk_t *urg_mp = mp;

				atmark = B_TRUE;
				mp = NULL;
				mlen = so->so_oobmark;

				/*
				 * It is assumed that the OOB mark does not
				 * land within an mblk, i.e., the mark falls
				 * on an mblk boundary, since the loop below
				 * advances one whole mblk at a time.
				 */
				do {
					so->so_oobmark -= MBLKL(urg_mp);
					mp = urg_mp;
					urg_mp = urg_mp->b_cont;
				} while (so->so_oobmark > 0);
				mp->b_cont = NULL;
				if (urg_mp != NULL) {
					urg_mp->b_next = so->so_rcv_q_head;
					so->so_rcv_q_head = urg_mp;
				}
			} else {
				so->so_oobmark -= mlen;
				if (so->so_oobmark == 0)
					atmark = B_TRUE;
			}
		}

		/*
		 * Queue data on the STREAM head.
		 */
		so->so_rcv_queued -= mlen;
		putnext(q, mp);
	}
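	/*
	 * All queued data has now been pushed to the STREAM head; reset
	 * the sonode receive queue pointers.
	 */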
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;

	/*
	 * Check if the oob byte is at the end of the data stream, or if the
	 * oob byte has not yet arrived. In the latter case we have to send a
	 * SIGURG and a mark indicator to the STREAM head. The mark indicator
	 * is needed to guarantee correct behavior for SIOCATMARK. See block
	 * comment in socktpi.h for more details.
	 */
	if (atmark || so->so_oobmark > 0) {
		mblk_t *mp;

		if (atmark && so->so_oobmsg != NULL) {
			struct T_exdata_ind *tei;

			mp = SOTOTPI(so)->sti_exdata_mp;
			SOTOTPI(so)->sti_exdata_mp = NULL;
			ASSERT(mp != NULL);
			mp->b_datap->db_type = M_PROTO;
			tei = (struct T_exdata_ind *)mp->b_rptr;
			tei->PRIM_type = T_EXDATA_IND;
			tei->MORE_flag = 0;
			mp->b_wptr = (uchar_t *)&tei[1];

			mp->b_cont = so->so_oobmsg;
			so->so_oobmsg = NULL;

			putnext(q, mp);
		} else {
			/* Send up the signal */
			mp = SOTOTPI(so)->sti_exdata_mp;
			SOTOTPI(so)->sti_exdata_mp = NULL;
			ASSERT(mp != NULL);
			DB_TYPE(mp) = M_PCSIG;
			*mp->b_wptr++ = (uchar_t)SIGURG;
			putnext(q, mp);

			/* Send up the mark indicator */
			mp = SOTOTPI(so)->sti_urgmark_mp;
			SOTOTPI(so)->sti_urgmark_mp = NULL;
			mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
			putnext(q, mp);

			so->so_oobmark = 0;
		}
	}

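	/*
	 * Release the preallocated T_EXDATA_IND and urgent-mark message
	 * blocks if they were not needed above.
	 */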
	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
		freeb(SOTOTPI(so)->sti_exdata_mp);
		SOTOTPI(so)->sti_exdata_mp = NULL;
	}

	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
		freeb(SOTOTPI(so)->sti_urgmark_mp);
		SOTOTPI(so)->sti_urgmark_mp = NULL;
	}

	ASSERT(so->so_oobmark == 0);
	ASSERT(so->so_rcv_queued == 0);
}

#ifdef DEBUG
/*
 * Do an integrity check of the sonode. This should be done if a
 * fallback fails after the sonode has initially been converted to use
 * TPI and subsequently has to be reverted.
 *
 * Failure to pass the integrity check will panic the system.
 */
void
so_integrity_check(struct sonode *cur, struct sonode *orig)
{
	VERIFY(cur->so_vnode == orig->so_vnode);
	VERIFY(cur->so_ops == orig->so_ops);
	/*
	 * For so_state we can only VERIFY the state flags in CHECK_STATE.
	 * The other state flags might be affected by a notification from
	 * the protocol.
	 */
#define	CHECK_STATE	(SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
	SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
	SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
	VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
	    (orig->so_state & CHECK_STATE));
	VERIFY(cur->so_mode == orig->so_mode);
	VERIFY(cur->so_flag == orig->so_flag);
	VERIFY(cur->so_count == orig->so_count);
	/* Cannot VERIFY so_proto_connid; proto can update it */
	VERIFY(cur->so_sockparams == orig->so_sockparams);
	/* An error might have been recorded, but it cannot be lost */
	VERIFY(cur->so_error != 0 || orig->so_error == 0);
	VERIFY(cur->so_family == orig->so_family);
	VERIFY(cur->so_type == orig->so_type);
	VERIFY(cur->so_protocol == orig->so_protocol);
	VERIFY(cur->so_version == orig->so_version);
	/* New conns might have arrived, but none should have been lost */
	VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
	VERIFY(cur->so_acceptq_head == orig->so_acceptq_head);
	VERIFY(cur->so_backlog == orig->so_backlog);
	/* New OOB might have arrived, but the mark should not have been lost */
	VERIFY(cur->so_oobmark >= orig->so_oobmark);
	/* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
	VERIFY(cur->so_pgrp == orig->so_pgrp);
	VERIFY(cur->so_peercred == orig->so_peercred);
	VERIFY(cur->so_cpid == orig->so_cpid);
	VERIFY(cur->so_zoneid == orig->so_zoneid);
	/* New data might have arrived, but none should have been lost */
	VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
	VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
	VERIFY(cur->so_rcv_head == orig->so_rcv_head);
	VERIFY(cur->so_proto_handle == orig->so_proto_handle);
	VERIFY(cur->so_downcalls == orig->so_downcalls);
	/* Cannot VERIFY so_proto_props; they can be updated by proto */
}
#endif

/*
 * so_tpi_fallback()
 *
 * This is the fallback initiation routine; things start here.
 *
 * Basic strategy:
 *   o Block new socket operations from coming in
 *   o Allocate/initiate info needed by TPI
 *   o Quiesce the connection, at which point we sync
 *     state and move data
 *   o Change operations (sonodeops) associated with the socket
 *   o Unblock threads waiting for the fallback to finish
 */
int
so_tpi_fallback(struct sonode *so, struct cred *cr)
{
	int error;
	queue_t *q;
	struct sockparams *sp;
	struct sockparams *newsp = NULL;
	so_proto_fallback_func_t fbfunc;
	boolean_t direct;
	struct sonode *nso;
#ifdef DEBUG
	struct sonode origso;
#endif
	error = 0;
	sp = so->so_sockparams;
	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;

	/*
	 * Fallback can only happen if there is a device associated
	 * with the sonode, and the socket module has a fallback function.
	 */
	if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
		return (EINVAL);

	/*
	 * Initiate fallback; upon success we know that no new requests
	 * will come in from the user.
	 */
	if (!so_start_fallback(so))
		return (EAGAIN);
#ifdef DEBUG
	/*
	 * Make a copy of the sonode in case we need to perform an
	 * integrity check later on.
	 */
	bcopy(so, &origso, sizeof (*so));
#endif

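	/* Account for the fallback attempt in the sockparams stats */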
	sp->sp_stats.sps_nfallback.value.ui64++;

	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
	    KM_SLEEP, &error);
	if (error != 0)
		goto out;

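	/*
	 * Disable the direct receive path (sodirect); once the fallback
	 * completes, incoming data flows through the STREAM head instead.
	 */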
	if (so->so_direct != NULL) {
		sodirect_t *sodp = so->so_direct;
		mutex_enter(sodp->sod_lockp);

		so->so_direct->sod_state &= ~SOD_ENABLED;
		so->so_state &= ~SS_SODIRECT;
		ASSERT(sodp->sod_uioafh == NULL);
		mutex_exit(sodp->sod_lockp);
	}

	/* Turn sonode into a TPI socket */
	error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
	if (error != 0)
		goto out;

	/*
	 * Now tell the protocol to start using TPI. so_quiesced_cb will
	 * be called once it is safe to synchronize state.
	 */
	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
	error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);

	if (error != 0) {
		/* protocol was unable to do a fallback, revert the sonode */
		sotpi_revert_sonode(so, cr);
		goto out;
	}

	/*
	 * Walk the accept queue and notify the protocol that each pending
	 * connection should fall back to TPI. The protocol will send up
	 * the T_CONN_IND.
	 */
	nso = so->so_acceptq_head;
	while (nso != NULL) {
		int rval;

		DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
		rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, NULL);
		DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
		if (rval != 0) {
			zcmn_err(getzoneid(), CE_WARN,
			    "Failed to convert socket in accept queue to TPI. "
			    "Pid = %d\n", curproc->p_pid);
		}
		nso = nso->so_acceptq_next;
	}

	/*
	 * Now flush the acceptq; this will destroy all sockets. They will
	 * be recreated in sotpi_accept().
	 */
	so_acceptq_flush(so);

	mutex_enter(&so->so_lock);
	so->so_state |= SS_FALLBACK_COMP;
	mutex_exit(&so->so_lock);

	/*
	 * Swap the sonode ops. Socket operations that come in once this
	 * is done will proceed without blocking.
	 */
	so->so_ops = &sotpi_sonodeops;

	/*
	 * Wake up any threads stuck in poll. This is needed since the poll
	 * head changes when the fallback happens (moves from the sonode to
	 * the STREAMS head).
	 */
	pollwakeup(&so->so_poll_list, POLLERR);
out:
	so_end_fallback(so);

	if (error != 0) {
#ifdef DEBUG
		so_integrity_check(so, &origso);
#endif
		zcmn_err(getzoneid(), CE_WARN,
		    "Failed to convert socket to TPI (err=%d). Pid = %d\n",
		    error, curproc->p_pid);
		if (newsp != NULL)
			SOCKPARAMS_DEC_REF(newsp);
	}

	return (error);
}